Skip to content

Commit

Permalink
Merge e64be38 into ec47cfb
Browse files Browse the repository at this point in the history
  • Loading branch information
youngmit committed Jan 16, 2020
2 parents ec47cfb + e64be38 commit 8c20129
Show file tree
Hide file tree
Showing 3 changed files with 146 additions and 26 deletions.
30 changes: 29 additions & 1 deletion armi/bookkeeping/db/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,35 @@
The database can be visualized through various tools such as XTVIEW.
This module contains factories for selecting and building DB-related objects
This module contains factories for selecting and building DB-related objects.
Some notes on versions
----------------------
Persistent storage of ARMI models has seen many changes throughout the years.
Calculation results were originally stored in a SQL database (version 1), which has been
fully deprecated at this point.
Version 2 was the first to use HDF5 as the primary storage format. This was beneficial,
as it did not rely on any external infrastructure to operate, and benefited from the
suite of tools that help interact with HDF5 files. It was eventually replaced because
it did not store a complete model of the reactor, but rather a ghost of assembly, block,
and reactor parameters that could be applied to an existing reactor model (so long as
the dimensions were consistent!). This led to loading reactors being inconvenient and
error-prone, and also posed a limitation for representing more complex systems that have
non-core components.
Version 3 was created to make the schema more flexible and to permit storing the entire
reactor model within the HDF5 file. All objects in the ARMI Composite Model are written
to the database, and the model can be recovered in its entirety just from the HDF5 file.
Since its inception, it has seen a number of tweaks to improve its functionality and
fix bugs.
Being a serialization format, the code associated with reading and writing database
files may not benefit from Don't Repeat Yourself (DRY) practices in the same way as
other code. Therefore, we do not share much code between different major versions of the
databases. Minor revisions (e.g. M.(N+1)) to the database structure should be simple
enough that specialized logic can be used to support all minor versions without posing a
maintenance burden. A detailed change log should be maintained of each minor revision.
"""
import os
from typing import Optional, List, Tuple
Expand Down
86 changes: 74 additions & 12 deletions armi/bookkeeping/db/database3.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,18 @@
for a given object or collection of object from the database file. When interacting with
the database file, the ``Layout`` class is used to help map the hierarchical Composite
Reactor Model to the flat representation in the database.
Minor revision changelog
------------------------
- 3.1: Improve the handling of reading/writing grids.
- 3.2: Change the strategy for storing large attributes from using an Object Reference
to an external dataset to using a special string starting with an "@" symbol (e.g.,
"@/c00n00/attrs/5_linkedDims"). This was done to support copying time node datasets
from one file to another without invalidating the references. Support is maintained
for reading previous versions, and for performing a ``mergeHistory()`` and converting
to the new reference strategy, but the old version cannot be written.
"""
import collections
import copy
Expand Down Expand Up @@ -77,7 +89,9 @@
from armi.utils.textProcessors import resolveMarkupInclusions

ORDER = interfaces.STACK_ORDER.BOOKKEEPING
DB_VERSION = "3.1"
DB_VERSION = "3.2"

ATTR_LINK = re.compile("^@(.*)$")


def getH5GroupName(cycle, timeNode, statePointName=None):
Expand Down Expand Up @@ -384,8 +398,6 @@ class Database3(database.Database):
`doc/user/outputs/database` for more details.
"""

version = DB_VERSION

timeNodeGroupPattern = re.compile(r"^c(\d\d)n(\d\d)$")

def __init__(self, fileName: str, permission: str):
Expand Down Expand Up @@ -413,6 +425,31 @@ def __init__(self, fileName: str, permission: str):
# closed.
self._openCount: int = 0

if permission == "w":
self.version = DB_VERSION
else:
# will be set upon read
self._version = None
self._versionMajor = None
self._versionMinor = None

@property
def version(self):
    """Database format version string, e.g. ``"3.2"``."""
    return self._version

@version.setter
def version(self, value):
    """
    Set the version string, caching the parsed major/minor components.

    Splitting eagerly here means ``versionMajor``/``versionMinor`` are always
    consistent with ``version`` and cost nothing to read.
    """
    major, minor = value.split(".")
    self._version = value
    self._versionMajor = int(major)
    self._versionMinor = int(minor)

@property
def versionMajor(self):
    """Integer major component of the database version (read-only; set via ``version``)."""
    return self._versionMajor

@property
def versionMinor(self):
    """Integer minor component of the database version (read-only; set via ``version``)."""
    return self._versionMinor

def __repr__(self):
return "<{} {}>".format(
self.__class__.__name__, repr(self.h5db).replace("<", "").replace(">", "")
Expand All @@ -430,6 +467,7 @@ def open(self):
if self._permission in {"r", "a"}:
self._fullPath = os.path.abspath(filePath)
self.h5db = h5py.File(filePath, self._permission)
self.version = self.h5db.attrs["databaseVersion"]
return

if self._permission == "w":
Expand All @@ -447,7 +485,7 @@ def open(self):
self.h5db = h5py.File(filePath, self._permission)
self.h5db.attrs["successfulCompletion"] = False
self.h5db.attrs["version"] = armi.__version__
self.h5db.attrs["databaseVersion"] = DB_VERSION
self.h5db.attrs["databaseVersion"] = self.version
self.h5db.attrs["user"] = armi.USER
self.h5db.attrs["python"] = sys.version
self.h5db.attrs["armiLocation"] = os.path.dirname(armi.ROOT)
Expand Down Expand Up @@ -637,6 +675,20 @@ def mergeHistory(self, inputDB, startCycle, startNode):
return
self.h5db.copy(h5ts, h5ts.name)

if inputDB.versionMinor < 2:
# The source database may have object references in some attributes.
# make sure to link those up using our manual path strategy.
references = []
def findReferences(name, obj):
for key, attr in obj.attrs.items():
if isinstance(attr, h5py.h5r.Reference):
references.append((name, key, inputDB.h5db[attr].name))
h5ts.visititems(findReferences)

for key, attr, path in references:
destTs = self.h5db[h5ts.name]
destTs[key].attrs[attr] = "@{}".format(path)

def __enter__(self):
"""Context management support"""
if self._openCount == 0:
Expand Down Expand Up @@ -1976,11 +2028,10 @@ def _writeAttrs(obj, group, attrs):
In such cases, this will store the attribute data in a proper dataset and
place a reference to that dataset in the attribute instead.
In practice, this takes ``linkedDims`` attrs from a particular
component type (like ``c00/n00/Circle/id``) and stores them
in new datasets (like ``c00n00/attrs/1_linkedDims``,
``c00n00/attrs/2_linkedDims``) and
then sets the object's attrs to links to those datasets.
In practice, this takes ``linkedDims`` attrs from a particular component type (like
``c00n00/Circle/id``) and stores them in new datasets (like
``c00n00/attrs/1_linkedDims``, ``c00n00/attrs/2_linkedDims``) and then sets the
object's attrs to links to those datasets.
"""
for key, value in attrs.items():
try:
Expand All @@ -2001,7 +2052,10 @@ def _writeAttrs(obj, group, attrs):
dataName = str(len(attrGroup)) + "_" + key
attrGroup[dataName] = value

obj.attrs[key] = attrGroup[dataName].ref
# using a soft link here allows us to cheaply copy time nodes without
# needing to crawl through and update object references.
linkName = attrGroup[dataName].name
obj.attrs[key] = "@{}".format(linkName)


def _resolveAttrs(attrs, group):
Expand All @@ -2015,9 +2069,17 @@ def _resolveAttrs(attrs, group):
for key, val in attrs.items():
try:
if isinstance(val, h5py.h5r.Reference):
# dereference the .ref to get the data
# out of the dataset.
# Old style object reference. If this cannot be dereferenced, it is
# likely because mergeHistory was used to get the current database,
# which does not preserve references.
resolved[key] = group[val]
elif isinstance(val, str):
m = ATTR_LINK.match(val)
if m:
# dereference the path to get the data out of the dataset.
resolved[key] = group[m.group(1)][()]
else:
resolved[key] = val
else:
resolved[key] = val
except ValueError:
Expand Down
56 changes: 43 additions & 13 deletions armi/bookkeeping/db/tests/test_database3.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,20 +26,29 @@


class TestDatabase3(unittest.TestCase):
@classmethod
def setUpClass(cls):
    # Load the shared test reactor once per class; loading is expensive, so
    # doing it here rather than in setUp keeps the suite fast.
    cls.o, cls.r = test_reactors.loadTestReactor(TEST_ROOT)

def setUp(self):
    """Create a fresh, open database named after the test method and retain reactor state."""
    # NOTE(review): this reloads the reactor even though setUpClass also loads
    # one -- possibly a diff artifact or deliberate per-test isolation; confirm.
    self.o, self.r = test_reactors.loadTestReactor(TEST_ROOT)
    # One database file per test, e.g. "test_mergeHistory.h5"
    self.db = database.Database3(self._testMethodName + ".h5", "w")
    self.db.open()
    # NOTE(review): looks like a leftover debug print; consider removing.
    print(self.db._fullPath)
    # Manually enter the state-retaining context; tearDown exits it.
    self.stateRetainer = self.r.retainState().__enter__()

def tearDown(self):
    """Close the database and restore the reactor state captured in setUp."""
    self.db.close()
    # Exit the context entered manually in setUp, rolling back parameter state.
    self.stateRetainer.__exit__()

def makeHistory(self):
    """
    Walk the reactor through a few time steps and write each one to the db.
    """
    for cycle in range(3):
        for node in range(3):
            self.r.p.cycle = cycle
            self.r.p.timeNode = node
            # something that splitDatabase won't change, so that we can make
            # sure that the right data went to the right new groups/cycles
            self.r.p.cycleLength = cycle

            self.db.writeToDB(self.r)

def _compareArrays(self, ref, src):
"""
Compare two numpy arrays.
Expand Down Expand Up @@ -94,15 +103,36 @@ def test_replaceNones(self):
self._compareRoundTrip(dataJagNones)
self._compareRoundTrip(dataDict)

def test_splitDatabase(self):
for cycle, node in ((cycle, node) for cycle in range(3) for node in range(3)):
self.r.p.cycle = cycle
self.r.p.timeNode = node
# something that splitDatabase won't change, so that we can make sure that
# the right data went to the right new groups/cycles
self.r.p.cycleLength = cycle
def test_mergeHistory(self):
    """Verify mergeHistory converts old-style object-reference attrs to "@" path links."""
    self.makeHistory()

    # put some big data in an HDF5 attribute. This will exercise the code that pulls
    # such attributes into a formal dataset and a reference.
    self.r.p.cycle = 1
    self.r.p.timeNode = 0
    tnGroup = self.db.getH5Group(self.r)
    database._writeAttrs(
        tnGroup["layout/serialNum"], tnGroup, {"fakeBigData": numpy.eye(6400),
        "someString": "this isn't a reference to another dataset"}
    )

    # NOTE(review): re-writing the current reactor state into an already-written
    # time node looks like it may be a leftover from the old loop -- confirm.
    self.db.writeToDB(self.r)
    db2 = database.Database3("restartDB.h5", "w")
    with db2:
        db2.mergeHistory(self.db, 2, 2)
        self.r.p.cycle = 1
        self.r.p.timeNode = 0
        tnGroup = db2.getH5Group(self.r)

        # this test is a little bit implementation-specific, but nice to be explicit:
        # the merged file should carry the "@"-prefixed path link, not an h5py ref
        self.assertEqual(tnGroup["layout/serialNum"].attrs["fakeBigData"],
            "@/c01n00/attrs/0_fakeBigData")

        # actually exercise the _resolveAttrs function
        attrs = database._resolveAttrs(tnGroup["layout/serialNum"].attrs, tnGroup)
        self.assertTrue(numpy.array_equal(attrs["fakeBigData"], numpy.eye(6400)))

def test_splitDatabase(self):
self.makeHistory()

self.db.splitDatabase(
[(c, n) for c in (1, 2) for n in range(3)], "-all-iterations"
Expand Down

0 comments on commit 8c20129

Please sign in to comment.