Generate collection names using a hash function #84

Merged
Changes from 4 commits
1 change: 1 addition & 0 deletions CHANGES_NEXT_RELEASE
@@ -4,3 +4,4 @@
* [FEATURE] Version information provided using the /version URL path (#16)
* [FEATURE] Including attribute type information when retrieving raw data (#54)
* [FEATURE] dateFrom and dateTo as optional parameters in queries (#53)
* [FEATURE] Generate collection names using a hash function (#83)
30 changes: 28 additions & 2 deletions README.md
@@ -295,13 +295,17 @@ a counter used as the suffix for the log file name. Optional. Default value: "0"
- LOG_DIR: The path to a directory where the log file will be searched for or created if it does not exist. Optional. Default value: "./log".
- LOG_FILE_NAME: The name of the file where the logs will be stored. Optional. Default value: "sth_app.log".
- PROOF_OF_LIFE_INTERVAL: The time in seconds between proof of life logging messages informing that the server is up and running normally. Default value: "60".
- DB_PREFIX: The prefix to be added to the service for the creation of the databases. More information below. Optional. Default value: "sth".
- DB_PREFIX: The prefix to be added to the service for the creation of the databases. More information below. Optional. Default value: "sth_".
- DEFAULT_SERVICE: The service to be used if not sent by the Orion Context Broker in the notifications. Optional. Default value: "orion".
- COLLECTION_PREFIX: The prefix to be added to the collections in the databases. More information below. Optional. Default value: "sth".
- COLLECTION_PREFIX: The prefix to be added to the collections in the databases. More information below. Optional. Default value: "sth_".
- DEFAULT_SERVICE_PATH: The service path to be used if not sent by the Orion Context Broker in the notifications. Optional. Default value: "/".
- POOL_SIZE: The default MongoDB pool size of database connections. Optional. Default value: "5".
- WRITE_CONCERN: The <a href="http://docs.mongodb.org/manual/core/write-concern/" target="_blank">write concern policy</a> to apply when writing data to the MongoDB database. Default value: "1".
- SHOULD_STORE: Flag indicating if the raw and/or aggregated data should be persisted. Valid values are: "only-raw", "only-aggregated" and "both". Default value: "both".
- SHOULD_HASH: Flag indicating if the raw and/or aggregated data collection names should include a hash portion. This is mostly
due to MongoDB's limitation regarding the number of bytes a namespace may have (currently limited to 120 bytes). In case of hashing,
information about the final collection name and its correspondence to each concrete service path, entity and (if applicable) attribute
is stored in a collection named `COLLECTION_PREFIX + "collection_names"`. Default value: "true".
- DATA_MODEL: The data model to use. Currently 3 possible values are supported: collection-per-service-path (which creates a MongoDB collection
per service path to store the data), collection-per-entity (which creates a MongoDB collection per service path and entity to store the data)
and collection-per-attribute (which creates a collection per service path, entity and attribute to store the data). More information about these
@@ -345,6 +349,28 @@ the attribute type does not have any special semantic or effect currently.
As already mentioned, all these configuration parameters can also be adjusted using the
[`config.js`](https://github.com/telefonicaid/IoT-STH/blob/develop/config.js) file whose contents are self-explanatory.

It is important to note that there is a limitation of 120 bytes for the namespaces (concatenation of the database name and
collection names) in MongoDB (see <a href="http://docs.mongodb.org/manual/reference/limits/#namespaces" target="_blank">http://docs.mongodb.org/manual/reference/limits/#namespaces</a>
for further information). Related to this, the STH generates the collection names using 2 possible mechanisms:

1. <u>Plain text</u>: In case the `SHOULD_HASH` configuration parameter is set to 'false', the collection names are
generated as a concatenation of the `COLLECTION_PREFIX` plus the service path (in case of the collection-per-service-path
data model) plus the entity id plus the entity type (in case of the collection-per-entity data model) plus the attribute name
Contributor:

We agreed to eliminate all information related to data models

Member Author:

But not in this PR, right? :) I wanted to include everything needed and in a new PR delete all the information just to have that information together in case someone asks us to get it back... :p

Member:

+1 not in this PR

(in case of the collection-per-attribute data model), plus '.aggr' for the collections of the aggregated data. The length
of the collection name plus the `DB_PREFIX` plus the database name (or service) should not exceed 120 bytes in UTF-8,
or MongoDB will complain, will not create the collection and, consequently, no data will be stored by the STH.

2. <u>Hash based</u>: In case the `SHOULD_HASH` option is set to anything other than 'false' (the default option), the
collection names are generated as a concatenation of the `COLLECTION_PREFIX`, a generated hash and '.aggr' for the
collections of the aggregated data. To avoid collisions, these hashes are forced to be at least 20 bytes long.
Once again, the length of the collection name plus the `DB_PREFIX` plus the database name (or service) should not
exceed 120 bytes in UTF-8, or MongoDB will complain, will not create the collection and, consequently, no data
will be stored by the STH. The hash function used is SHA-512.

In case of using hashes as part of the collection names, and to let the user or developer easily recover this information,
a collection named ```COLLECTION_PREFIX + "collection_names"``` is created and fed with information regarding the mapping
between the collection names and the combination of concrete services, service paths, entities and attributes.
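The following is a minimal sketch of the hash-based naming scheme described above, assuming the default `sth_` prefixes and an `orion` service; the helper names (`availableHashBytes`, `hashedCollectionName`) are illustrative and are not part of the STH code:

```js
// Minimal sketch of the hash-based collection naming described above (illustrative only).
var crypto = require('crypto');

var MAX_NAMESPACE_SIZE_IN_BYTES = 120;

// Bytes left for the hash once the database name, the collection prefix and '.aggr' are accounted for.
function availableHashBytes(databaseName, collectionPrefix) {
  return MAX_NAMESPACE_SIZE_IN_BYTES - Buffer.byteLength(databaseName, 'utf8') -
    Buffer.byteLength(collectionPrefix, 'utf8') - Buffer.byteLength('.aggr', 'utf8') - 1;
}

// SHA-512 hash of the "plain text" name, truncated to the available bytes.
function hashedCollectionName(collectionPrefix, plainName, limit) {
  var hash = crypto.createHash('sha512').update(plainName).digest('hex').substr(0, limit);
  return collectionPrefix + hash;
}

// Example: database "sth_orion" and prefix "sth_" leave 120 - 9 - 4 - 5 - 1 = 101 bytes for the hash.
var limit = availableHashBytes('sth_orion', 'sth_');
console.log(hashedCollectionName('sth_', '/myServicePath_myEntityId_myEntityType', limit));
```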

[Top](#section0)

##<a id="section5"></a> Inserting data (random single events and their aggregated data) into the database
10 changes: 8 additions & 2 deletions config.js
@@ -33,10 +33,10 @@ config.database = {
// The name of the replica set to connect to, if any. Default value: "".
replicaSet: '',
// The prefix to be added to the service for the creation of the databases. Default value: "sth".
prefix: 'sth',
prefix: 'sth_',
// The prefix to be added to the collections in the databases. More information below.
// Default value: "sth".
collectionPrefix: 'sth',
collectionPrefix: 'sth_',
// The default MongoDB pool size of database connections. Optional. Default value: "5".
poolSize: '5',
// The write concern (see http://docs.mongodb.org/manual/core/write-concern/) to apply when
@@ -45,6 +45,12 @@ config.database = {
// Flag indicating if the raw and/or aggregated data should be persisted. Valid values are:
// "only-raw", "only-aggregated" and "both". Default value: "both".
shouldStore: 'both',
// Flag indicating if the raw and/or aggregated data collection names should include a hash portion.
// This is mostly due to MongoDB's limitation regarding the number of bytes a namespace may have
// (currently limited to 120 bytes). In case of hashing, information about the final collection name
// and its correspondence to each concrete service path, entity and (if applicable) attribute
// is stored in a collection named `COLLECTION_PREFIX + "collection_names"`. Default value: "true".
shouldHash: 'true',
// The data model to use. Currently 3 possible values are supported: collection-per-service-path
// (which creates a MongoDB collection per service path to store the data), collection-per-entity
// (which creates a MongoDB collection per service path and entity to store the data) and
1 change: 1 addition & 0 deletions package.json
@@ -13,6 +13,7 @@
},
"dependencies": {
"boom": "^2.7.1",
"bytes-counter": "^1.0.0",
"good": "^5.1.2",
"good-console": "^4.1.0",
"good-file": "^4.0.2",
12 changes: 10 additions & 2 deletions src/sth_configuration.js
@@ -40,13 +40,14 @@
NOT_AVAILABLE: 'NA',
SHUTDOWN: 'OPER_STH_SHUTDOWN',
DB_CONN_OPEN: 'OPER_STH_DB_CONN_OPEN',
DB_LOG: 'OPER_STH_DB_LOG',
DB_CONN_CLOSE: 'OPER_STH_DB_CONN_CLOSE',
SERVER_START: 'OPER_STH_SERVER_START',
SERVER_LOG: 'OPER_STH_SERVER_LOG',
SERVER_STOP: 'OPER_STH_SERVER_STOP'
},
DB_PREFIX: ENV.DB_PREFIX || config.database.prefix || 'sth',
COLLECTION_PREFIX: ENV.COLLECTION_PREFIX || config.database.collectionPrefix || 'sth',
DB_PREFIX: ENV.DB_PREFIX || config.database.prefix || 'sth_',
COLLECTION_PREFIX: ENV.COLLECTION_PREFIX || config.database.collectionPrefix || 'sth_',
DATA_MODELS: {
COLLECTIONS_PER_SERVICE_PATH: 'collection-per-service-path',
COLLECTIONS_PER_ENTITY: 'collection-per-entity',
@@ -140,6 +141,13 @@
} else {
module.exports.DATA_MODEL = 'collection-per-entity';
}
if (ENV.SHOULD_HASH) {
module.exports.SHOULD_HASH = ENV.SHOULD_HASH !== 'false';
} else if (config.database.shouldHash) {
module.exports.SHOULD_HASH = config.database.shouldHash !== 'false';
} else {
module.exports.SHOULD_HASH = true;
}
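As a hypothetical illustration of the resolution order above (the SHOULD_HASH environment variable wins if set, then `config.database.shouldHash`, then the default), note that only the exact string 'false' disables hashing:

```js
// Hypothetical resolution examples (not part of the PR):
//   ENV.SHOULD_HASH = 'false'                               -> SHOULD_HASH === false
//   ENV.SHOULD_HASH = 'no' (or any other non-empty string)  -> SHOULD_HASH === true
//   ENV.SHOULD_HASH unset, config shouldHash = 'false'      -> SHOULD_HASH === false
//   ENV.SHOULD_HASH unset, config shouldHash unset          -> SHOULD_HASH === true (default)
```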
module.exports.DB_USERNAME = dbUsername;
module.exports.DB_PASSWORD = dbPassword;
module.exports.DB_AUTHENTICATION = (dbUsername && dbPassword) ?
182 changes: 171 additions & 11 deletions src/sth_database.js
@@ -4,9 +4,15 @@
"use strict";

var mongoose = require('mongoose');
var boom = require('boom');
var crypto = require('crypto');
var bytesCounter = require('bytes-counter');

var sthConfig, sthLogger, sthHelper, connectionURL, eventSchema, aggregatedSchema;

var MAX_NAMESPACE_SIZE_IN_BYTES = 120,
MIN_HASH_SIZE_IN_BYTES = 20;

/**
* Declares the Mongoose schemas.
*/
@@ -169,52 +175,117 @@
* @return {string} The database name
*/
function getDatabase(service) {
return sthConfig.DB_PREFIX + '_' + service;
return sthConfig.DB_PREFIX + service;
}

/**
* Return the name of the collection which will store the raw events
* @param {string} databaseName The database name
* @param {string} servicePath The service path of the entity the event is related to
* @param {string} entityId The entity id related to the event
* @param {string} entityType The type of entity related to the event
* @param {string} attrName The attribute id related to the event
* @returns {string} The collection name
*/
function getCollectionName4Events(servicePath, entityId, entityType, attrName) {
function getCollectionName4Events(databaseName, servicePath, entityId, entityType, attrName) {
var collectionName4Events;
switch(sthConfig.DATA_MODEL) {
case sthConfig.DATA_MODELS.COLLECTIONS_PER_SERVICE_PATH:
return sthConfig.COLLECTION_PREFIX + '_' + servicePath;
collectionName4Events = servicePath;
break;
case sthConfig.DATA_MODELS.COLLECTIONS_PER_ENTITY:
return sthConfig.COLLECTION_PREFIX + '_' + servicePath + '_' + entityId + (entityType ? '_' + entityType : '');
collectionName4Events = servicePath + '_' + entityId + (entityType ? '_' + entityType : '');
break;
case sthConfig.DATA_MODELS.COLLECTIONS_PER_ATTRIBUTE:
return sthConfig.COLLECTION_PREFIX + '_' + servicePath + '_' + entityId + (entityType ? '_' + entityType : '') +
collectionName4Events = servicePath + '_' + entityId + (entityType ? '_' + entityType : '') +
'_' + attrName;
break;
}
if (sthConfig.SHOULD_HASH) {
var limit = getHashSizeInBytes(databaseName);
if (limit < MIN_HASH_SIZE_IN_BYTES) {
sthLogger.warn('The available bytes for the hashes to be used as part of the collection names is not big enough (' +
'at least ' + MIN_HASH_SIZE_IN_BYTES + ' bytes are needed), ' +
'please reduce the size of the DB_PREFIX ("' + sthConfig.DB_PREFIX + '" = ' + bytesCounter.count(sthConfig.DB_PREFIX) + ' bytes), ' +
'the service ("' + databaseName.substring(sthConfig.DB_PREFIX.length, databaseName.length) + '" = ' + bytesCounter.count(databaseName) +
Member:

At...

("' + databaseName.substring(sthConfig.DB_PREFIX.length, databaseName.length) + '" = ' + bytesCounter.count(databaseName) + ' bytes)

You are talking about the size of the fiware-service but calculating the size of the database.

Member Author:

Indeed ;) Good catch! Updating it ;)

Member Author:

Fixed in 0b832b3

' bytes) and/or the COLLECTION_PREFIX ("' + sthConfig.COLLECTION_PREFIX + '" = ' + bytesCounter.count(sthConfig.COLLECTION_PREFIX) +
' bytes)',
{
operationType: sthConfig.OPERATION_TYPE.DB_LOG
}
);
return null;
}
return sthConfig.COLLECTION_PREFIX + generateHash(collectionName4Events, limit);
} else {
return sthConfig.COLLECTION_PREFIX + collectionName4Events;
Member:

It seems we are not controlling the size of the collection name in this case. I think we agreed on truncating the collection name if it exceeded 120 bytes.

Member Author:

Oops, sorry! I didn't understand it that way :p In fact, in my opinion it would be kind of problematic for us to truncate the collection name (magically, from a user's point of view) due to possible collisions in the collection names :)

In fact, I would be more in favor of logging an informative message in case the namespace takes more than 120 bytes. Currently, we are logging the message MongoDB returns, but we could do something similar to what we do with the hashing option. I mean, informing the user that the namespace (DB_PREFIX + service + COLLECTION_PREFIX + collectionName + '.aggr') takes more than 120 bytes and consequently the MongoDB collection won't be created :)

What do you think @frbattid and @iariasleon ? Thanks!

Member:

That's OK for me. I mean, truncation was never implemented in Cygnus when the maximum length was reached with any other backend, and a log message was printed. Nevertheless, I thought we had decided to start truncating, my fault! Thus, the only thing missing is checking the length and warning about it, right? Or is it done outside this function?

Member Author:

In fact, I just did it since it was straightforward, until we decide about it :)

Currently, for the hashing case, the following message is logged in case the bytes available for the hash are less than 20 bytes:

time=2015-05-22T07:45:16.403Z | lvl=WARN | corr=NA | trans=NA | op=OPER_STH_DB_LOG | msg=The available bytes for the hashes to be used as part of the collection names are not enough (currently 17 and at least 20 bytes are needed), please reduce the size of the DB_PREFIX ("sth_" = 4 bytes), the service ("orion" = 5 bytes) and/or the COLLECTION_PREFIX ("ññññññññññññññññññññññññññññññññññññññññññññ" = 88 bytes) to save more bytes for the hash

On the other hand, for the other case, the following message is logged in case the namespace (the concatenation of the database name and the collection name for the aggregated data) is bigger than 120 bytes:

time=2015-05-22T07:42:31.101Z | lvl=WARN | corr=NA | trans=NA | op=OPER_STH_DB_LOG | msg=The size in bytes of the namespace for storing the aggregated data ("sth_orion" plus "ñññññññññññññññññññññññññññññññññññññññññññ/_entityId_entityType.aggr", 121 bytes) is bigger than 120 bytes, please reduce the size of the DB_PREFIX ("sth_" = 4 bytes), the service ("orion" = 5 bytes), the COLLECTION_PREFIX ("ñññññññññññññññññññññññññññññññññññññññññññ" = 86 bytes), the entity id ("entityId" = 8 bytes) and/or the entity type ("entityType" = 10 bytes) to make the namespace fit in the available bytes

Member Author:

I have not pushed these last changes until we decide about it ;)

Member:

Push them! :)

Member Author:

Pushed and included in 7f4824a

}
}

/**
* Returns the available hash size in bytes to be used as part of the collection names
* based on the database name, database name prefix and collection name prefix
* @param databaseName The database name
* @return {number} The size of the hash in bytes
*/
function getHashSizeInBytes(databaseName) {
return MAX_NAMESPACE_SIZE_IN_BYTES - bytesCounter.count(databaseName) -
bytesCounter.count(sthConfig.COLLECTION_PREFIX) - bytesCounter.count('.aggr') - 1;
}
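As a rough, hypothetical illustration of the arithmetic above (assuming the default 'sth_' prefixes and an 'orion' service):

```js
// 120 (max namespace) - 9 ("sth_orion") - 4 ("sth_") - 5 (".aggr") - 1 = 101 bytes available for the hash.
// With a very long COLLECTION_PREFIX this can drop below MIN_HASH_SIZE_IN_BYTES (20 bytes),
// which triggers the warning logged in getCollectionName4Events() above.
```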

/**
* Return the name of the collection which will store the aggregated data
* @param {string} databaseName The database name
* @param {string} servicePath The service path of the entity the event is related to
* @param {string} entityId The entity id related to the event
* @param {string} entityType The type of entity related to the event
* @param {string} attrName The attribute id related to the event
* @returns {string} The collection name
*/
function getCollectionName4Aggregated(servicePath, entityId, entityType,
function getCollectionName4Aggregated(databaseName, servicePath, entityId, entityType,
attrName) {
return getCollectionName4Events(
servicePath, entityId, entityType, attrName) + '.aggr';
var collectionName4Events = getCollectionName4Events(
databaseName, servicePath, entityId, entityType, attrName);
if (collectionName4Events) {
return collectionName4Events + '.aggr';
} else {
return null;
}
}

/**
* Returns a reference to a collection of the database asynchronously
* @param {string} databaseName The database's name
* @param {string} collectionName The collection's name
* @param {object} params Params (service, service path, entity, attribute or collection) for which the collection
* should be returned
* @param {boolean} isAggregated Flag indicating if the aggregated collection is desired. If false, the raw data
collection is the one required
* @param {boolean} shouldCreate Flag indicating if the collection should be created
* if it does not exist
* @param {boolean} shouldStoreHash Flag indicating if the collection hash should be stored in case the collection
* is created
* @param {Function} callback Callback function to be called with the results
*/
function getCollection(databaseName, collectionName, shouldCreate, callback) {
function getCollection(params, isAggregated, shouldCreate, shouldStoreHash, callback) {
var databaseName = getDatabase(params.service);

shouldStoreHash = sthConfig.SHOULD_HASH && shouldStoreHash;

var collectionName;
if (params.collection) {
collectionName = params.collection;
} else {
collectionName = isAggregated ?
getCollectionName4Aggregated(databaseName, params.servicePath, params.entityId, params.entityType,
params.attrName) :
getCollectionName4Events(databaseName, params.servicePath, params.entityId, params.entityType,
params.attrName);
}

if (!collectionName) {
var error = boom.badRequest('The collection name could not be generated');
return process.nextTick(callback.bind(null, error));
}

// Switch to the right database
var connection = mongoose.connection.useDb(databaseName);

@@ -226,6 +297,17 @@
shouldCreate) {
connection.db.createCollection(collectionName,
function (err, collection) {
if (!err && shouldStoreHash) {
storeCollectionHash(params, isAggregated, collectionName, function(err) {
if (err) {
// There was an error when storing the collection hash
// Do nothing
sthLogger.warn('Error when storing the hash generated as part of the collection name into the database', {
operationType: sthConfig.OPERATION_TYPE.DB_LOG
});
}
});
}
if (err && err.message === 'collection already exists') {
// We have observed that although leaving the strict option to the default value, sometimes
// we get a 'collection already exists' error when executing connection.db#createCollection()
@@ -809,6 +891,84 @@
});
}

/**
* Generates a hash based on an input and a maximum number of bytes
* @param input The input to generate the hash from
* @param limit The maximum number of bytes of the hash
*/
function generateHash(input, limit) {
var shasum = crypto.createHash('sha512');
shasum.update(input);
var hash = shasum.digest('hex');
if (limit) {
hash = hash.substr(0, limit);
}
return hash;
}
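A brief, hypothetical usage note for the helper above (the inputs are illustrative):

```js
// A SHA-512 digest in hex is 128 characters long, so the hash is truncated when a limit is given:
//   generateHash('/myServicePath_myEntityId_myEntityType')      -> 128-character hex string
//   generateHash('/myServicePath_myEntityId_myEntityType', 20)  -> first 20 characters of that string
```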

/**
* Stores the collection name (hash) in the database
* @param params The params used to generate the collection name (hash)
* @param isAggregated Flag indicating if the collection name (hash) corresponds to aggregated data
* @param hash The generated hash used as part of the collection names
* @param callback A callback function
*/
function storeCollectionHash(params, isAggregated, hash, callback) {
getCollection({
service: params.service,
collection: sthConfig.COLLECTION_PREFIX + 'collection_names'
}, false, true, false, function(err, collection) {
if (err) {
return callback(err);
}
// 2 update operations are needed since MongoDB currently does not support the possibility
// to address the same field in a $set operation as a $setOnInsert operation
collection.update(
{
_id: hash
},
{
'$setOnInsert': {
dataModel: sthConfig.DATA_MODEL,
isAggregated: isAggregated,
Member:

Since the hash parameter already has the suffix ".aggr" when it is an aggregated collection name, I think this field is unnecessary.

Member Author:

It is :) I only included it to ease and speed up queries :) Otherwise, users would have to use text searching on the _id property of the collection which maps hashes to service paths, entities, etc. values... This would force us to create a text index on that property (see http://docs.mongodb.org/manual/reference/operator/query/text/) which, apart from consuming a namespace (this is not important :D), can end up consuming a lot of disk and memory (this is the counterpart of text indexes, and the reason why they are not recommended unless you really have to use them :))

What do you think?

Member:

Oh, I see... You convinced me :) NTC

service: params.service,
servicePath: params.servicePath,
entityId: params.entityId,
entityType: params.entityType,
attrName: params.attrName
}
}, {
upsert: true
},
function(err) {
if (err && callback) {
return callback(err);
}
collection.update(
{
_id: hash
},
{
'$set': {
dataModel: sthConfig.DATA_MODEL,
isAggregated: isAggregated,
service: params.service,
servicePath: params.servicePath,
entityId: params.entityId,
entityType: params.entityType,
attrName: params.attrName
}
},
function(err) {
if (callback) {
return callback(err);
}
}
);
}
);
});
}
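For reference, a document upserted into the `COLLECTION_PREFIX + 'collection_names'` mapping collection would look roughly as follows (all values are hypothetical, and the hash in `_id` is shortened for readability):

```js
// Hypothetical mapping document stored by storeCollectionHash():
// {
//   _id: 'sth_2bd981d2b32a429a8f72',        // the generated (hash-based) collection name
//   dataModel: 'collection-per-entity',
//   isAggregated: false,
//   service: 'orion',
//   servicePath: '/myServicePath',
//   entityId: 'myEntityId',
//   entityType: 'myEntityType'
//   // attrName is only present when the collection-per-attribute data model is used
// }
```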

module.exports = function (theSthConfig, theSthLogger, theSthHelper) {
sthConfig = theSthConfig;
sthLogger = theSthLogger;