# Serialization

The term *serialization* refers to storing results between executions of the code.  The standard way to do this in Python is by using the `pickle` module, but there are a number of disadvantages to this in the distributed scientific environment we're operating in here.

We really have four major choices here:
 - a relational database via `sqlalchemy`
 - a key-value store like MongoDB
 - HDF5 (which is a standard way of storing scientific/numerical data)
 - some hybrid combination of the above

My current feeling is that I should use SQL for the metadata (for searchability and parallelism) and HDF5 for the actual bulky numerical bits (for portability and performance). Of course, if I'm making any claims about performance, I really ought to run tests to verify my intuition.

Let's start with the metadata.

## An ORM for the flow object

In [1]:
from sqlalchemy import Column, Integer, String, Float
from sqlalchemy.ext.declarative import declarative_base

BASE = declarative_base()

class FlowORM(BASE):
    """Declarative SQLAlchemy mapping for one row of the ``flows`` table.

    Every attribute below is a column.  ``id`` is the integer primary key;
    the remaining columns hold the scalar metadata recorded for a flow.
    The matching ``Flow`` class is imported from ``layers.layers``.
    """

    __tablename__ = 'flows'

    # Primary key.
    id = Column(Integer, primary_key=True)

    # NOTE(review): presumably grid resolution (nx, ny) and domain
    # extent (lx, ly) -- confirm against the Flow class.
    nx = Column(Integer)
    ny = Column(Integer)
    lx = Column(Float)
    ly = Column(Float)

    # Scalar model parameters.
    beta = Column(Float)
    kappa = Column(Float)
    w = Column(Float)

    # NOTE(review): presumably the integrator name and its time step --
    # confirm against the Flow class.
    method = Column(String)
    dt = Column(Float)

    initial_amp = Column(Float)
    seed = Column(Integer)

    # String column limited to 50 characters.
    resultsfile = Column(String(50))

    def __repr__(self):
        """Return a short tag of the form ``<Flow ID>``."""
        row_id = self.id
        return "<Flow {}>".format(row_id)

In [2]:
from sqlalchemy import create_engine
# In-memory SQLite database. echo=True makes the engine log each SQL
# statement it runs, which is where the INFO lines in the outputs come from.
engine = create_engine('sqlite:///:memory:', echo=True)

In [3]:
import sys
sys.path.append('../')
from layers.layers import Flow

In [4]:
foo = Flow()

In [5]:
# NOTE(review): serialize() is defined on Flow (layers.layers, not shown here).
# Judging by the "<Flow None>" repr and the later session.add(bar), it
# presumably returns a FlowORM instance that has no id yet -- confirm.
bar = foo.serialize()

In [6]:
bar

<Flow None>

In [7]:
bar.seed

471913919

In [8]:
FlowORM.__table__

Table('flows', MetaData(bind=None), Column('id', Integer(), table=<flows>, primary_key=True, nullable=False), Column('nx', Integer(), table=<flows>), Column('ny', Integer(), table=<flows>), Column('lx', Float(), table=<flows>), Column('ly', Float(), table=<flows>), Column('beta', Float(), table=<flows>), Column('kappa', Float(), table=<flows>), Column('w', Float(), table=<flows>), Column('method', String(), table=<flows>), Column('dt', Float(), table=<flows>), Column('initial_amp', Float(), table=<flows>), Column('seed', Integer(), table=<flows>), Column('resultsfile', String(length=50), table=<flows>), schema=None)

In [9]:
# Create the tables registered on the declarative metadata (here only
# `flows`); the log output shows the CREATE TABLE statement that is run.
FlowORM.metadata.create_all(engine)

2017-03-06 21:42:24,878 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2017-03-06 21:42:24,879 INFO sqlalchemy.engine.base.Engine ()
2017-03-06 21:42:24,880 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2017-03-06 21:42:24,881 INFO sqlalchemy.engine.base.Engine ()
2017-03-06 21:42:24,882 INFO sqlalchemy.engine.base.Engine PRAGMA table_info("flows")
2017-03-06 21:42:24,883 INFO sqlalchemy.engine.base.Engine ()
2017-03-06 21:42:24,885 INFO sqlalchemy.engine.base.Engine 
CREATE TABLE flows (
	id INTEGER NOT NULL, 
	nx INTEGER, 
	ny INTEGER, 
	lx FLOAT, 
	ly FLOAT, 
	beta FLOAT, 
	kappa FLOAT, 
	w FLOAT, 
	method VARCHAR, 
	dt FLOAT, 
	initial_amp FLOAT, 
	seed INTEGER, 
	resultsfile VARCHAR(50), 
	PRIMARY KEY (id)
)


2017-03-06 21:42:24,886 INFO sqlalchemy.engine.base.Engine ()
2017-03-06 21:42:24,887 INFO sqlalchemy.engine.base.Engine COMMIT


In [10]:
from sqlalchemy.orm import sessionmaker
# `Session` is a factory bound to the engine; `session` is the single
# instance used for the add/commit calls in the following cells.
Session = sessionmaker(bind=engine)
session = Session()

In [11]:
# Register `bar` with the session. No SQL is logged for this cell; the
# INSERT only shows up in the log once commit() is called.
session.add(bar)

In [12]:
# Write pending changes to the database: the log shows the INSERT into
# `flows` followed by COMMIT.
session.commit()

2017-03-06 21:42:24,901 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2017-03-06 21:42:24,904 INFO sqlalchemy.engine.base.Engine INSERT INTO flows (nx, ny, lx, ly, beta, kappa, w, method, dt, initial_amp, seed, resultsfile) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
2017-03-06 21:42:24,905 INFO sqlalchemy.engine.base.Engine (64, 64, 1000000.0, 1000000.0, 6e-10, 10.0, 4e-06, 'dopri5', 7200.0, 120000.0, 471913919, None)
2017-03-06 21:42:24,906 INFO sqlalchemy.engine.base.Engine COMMIT


In [13]:
bar

2017-03-06 21:46:31,474 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2017-03-06 21:46:31,477 INFO sqlalchemy.engine.base.Engine SELECT flows.id AS flows_id, flows.nx AS flows_nx, flows.ny AS flows_ny, flows.lx AS flows_lx, flows.ly AS flows_ly, flows.beta AS flows_beta, flows.kappa AS flows_kappa, flows.w AS flows_w, flows.method AS flows_method, flows.dt AS flows_dt, flows.initial_amp AS flows_initial_amp, flows.seed AS flows_seed, flows.resultsfile AS flows_resultsfile 
FROM flows 
WHERE flows.id = ?
2017-03-06 21:46:31,478 INFO sqlalchemy.engine.base.Engine (1,)


<Flow 1>

In [14]:
# 24*3600 = 86400.
# NOTE(review): presumably one day expressed in seconds -- confirm the
# units expected by Flow.integrate.
foo.integrate(24*3600)

In [15]:
import h5py

In [16]:
foo.resultsfile

'flow-4e-06-10.0-471913919.h5'

In [17]:
# Open the results file in "w" mode (h5py creates the file, truncating any
# existing one). The handle stays open across the following cells and is
# closed explicitly by the later f.close() call.
f = h5py.File(foo.resultsfile, "w")

In [18]:
foo.results[0]

array([[ 0.+0.j, -0.+0.j,  0.+0.j, ..., -0.+0.j,  0.+0.j,  0.+0.j],
       [-0.+0.j,  0.+0.j,  0.+0.j, ...,  0.+0.j,  0.+0.j,  0.+0.j],
       [ 0.+0.j,  0.+0.j,  0.+0.j, ..., -0.+0.j,  0.+0.j,  0.+0.j],
       ..., 
       [ 0.+0.j,  0.+0.j,  0.+0.j, ...,  0.+0.j, -0.+0.j, -0.+0.j],
       [-0.+0.j,  0.+0.j,  0.+0.j, ...,  0.+0.j,  0.+0.j,  0.+0.j],
       [ 0.+0.j, -0.+0.j,  0.+0.j, ...,  0.+0.j, -0.+0.j,  0.+0.j]])

In [23]:
# numpy is imported here because the cell containing `import numpy as np`
# appears later in the notebook; without this, running the cells from top
# to bottom fails with a NameError on `np`.
import numpy as np

# Stack the sequence of result arrays into one array and store it in the
# open HDF5 file as the dataset named 'qhat'.
f.create_dataset('qhat', data=np.array(foo.results))

<HDF5 dataset "qhat": shape (13, 64, 33), type "<c16">

In [20]:
import numpy as np

In [30]:
list(f.keys())

['qhat']

In [32]:
foo.qhat.shape[1]

33

In [33]:
f.close()

In [35]:
f.file

RuntimeError: Can't retrieve file id (Invalid object id)