In [54]:
import sys
# Add the parent directory to the import path, presumably so that the `data`
# module imported on the next line can be found.
sys.path.append('./..')
import data as tutorial_data
# Location of the 'BYxRM' example file; it is opened read-only with h5py in the
# "Reading existing HDF5 file" section.
file_name = tutorial_data.get_file('BYxRM')

# Introduction to HDF5 in Python

based on [Giacomo Debidda's notebook](https://github.com/jackdbd/hdf5-pydata-munich/blob/master/hdf5_in_python.ipynb)


## HDF5: a filesystem in a file

**HDF** stands for *Hierarchical Data Format* and is a data model, library, and file format for storing and managing big and complex data.

An HDF5 file can be thought of as a container (or group) that holds a variety of heterogeneous data objects (or datasets). The datasets can be almost anything: images, tables, graphs, or even documents, such as PDF or Excel.

- Datasets (i.e. files in a filesystem)
- Groups (i.e. directories in a filesystem)
- Attributes (i.e. metadata of file/directory)


![hdf5_structure](images/hdf5_structure.jpg "HDF5 structure")

Every object in an HDF5 file has a name, and they’re arranged in a POSIX-style hierarchy with `/`-separators.  
The “folders” in this system are called groups. An h5py **File** object is itself a *group*: it represents the *root* group, named `/`  

**Groups work like dictionaries, and datasets work like NumPy arrays**

**/** root group (every HDF5 file has a root group)

**/foo** member of the root group called foo

**/foo/bar** member of the group foo called bar

## HDF5 in the Python data stack

![hdf5_in_python](images/h5py-pytables-refactor.png "HDF5 in Python data stack")



## h5py

The h5py package is a Pythonic interface to the HDF5 binary data format.

- Thin, pythonic wrapper around the HDF5 C API
- Written in Cython
- Tries to expose the entire HDF5 C API

In [55]:
import os
import numpy as np
import h5py

### Dataset

In [56]:
with h5py.File('data/my_h5py_file.h5', 'w') as h5_file:
    # without an explicit dtype, the element type is derived from the data
    h5_file.create_dataset('my_dataset', data=[1.0, 2.7, 3.7, 4.5])
    h5_file.create_dataset('another_dataset', data=[1, 2, 3, 4])
    # the same integers, stored as 32-bit floats because a dtype is requested
    h5_file.create_dataset('yet_another_dataset', data=[1, 2, 3, 4], dtype=np.float32)

In [57]:
with h5py.File('data/my_h5py_file.h5', 'r') as h5_file:
    my_dataset = h5_file['my_dataset']
    # printing the Dataset object shows only its description (name, shape, type)
    print(my_dataset)
    # indexing with [:] or [...] reads the stored values
    print(my_dataset[:])
    print(my_dataset[...])

<HDF5 dataset "my_dataset": shape (4,), type "<f8">
[1.  2.7 3.7 4.5]
[1.  2.7 3.7 4.5]


### Preallocation on disk

In [58]:
with h5py.File('data/my_h5py_file.h5', 'w') as h5_file:
    # only a shape is given: the dataset is created first and filled afterwards
    preallocated = h5_file.create_dataset('my_dataset', shape=(8, 1))
    preallocated[0] = 5.2
    preallocated[1] = 7

In [59]:
with h5py.File('data/my_h5py_file.h5', 'a') as h5_file:
    # reopen the file in append mode and keep filling the dataset created before
    existing = h5_file['my_dataset']
    existing[2] = 3.9
    existing[3] = 8.3

### Pick the correct HDF5 datatype

In [60]:
# uint8 can only represent 0..255. Build the array with NumPy's default integer
# dtype first and cast afterwards: the cast wraps out-of-range values modulo 256.
# Passing out-of-range Python integers straight to np.array(..., dtype='uint8')
# is deprecated since NumPy 1.24 and raises OverflowError on NumPy 2.
arr = np.array([0, 1, 254, 255, 256, -1, -2]).astype('uint8')
print(arr)

[  0   1 254 255   0 255 254]


In [61]:
with h5py.File(name='data/my_h5py_file.h5', mode='w') as f:
    f.create_dataset(name='my_dataset', shape=(7,), dtype=h5py.h5t.STD_U8BE)
    # Hand h5py a NumPy integer array instead of a plain list so that the
    # narrowing to 8-bit unsigned is done by HDF5 on write (out-of-range values
    # are clamped to 0 / 255, as the output below shows). A plain list may be
    # converted to the dataset's dtype by NumPy first, which wraps or fails
    # instead of clamping, depending on the NumPy/h5py versions in use.
    values = np.array([0, 1, 254, 255, 123456, -1, -2])
    # [:] addresses exactly the 7 allocated elements (the previous 0:8 slice
    # ran one past the end of the dataset)
    f['my_dataset'][:] = values
    print(f["my_dataset"][:])

[  0   1 254 255 255   0   0]


### Groups

In [62]:
with h5py.File('data/my_h5py_file.h5', 'w') as h5_file:
    h5_file.create_group('group1')
    # create_group() returns the new group, which can hold further groups itself
    h5_file.create_group('group2').create_group('group3')

In [63]:
with h5py.File('data/my_h5py_file.h5', 'r') as h5_file:
    # nested objects are addressed with a '/'-separated path;
    # .parent leads from group3 back to the enclosing group2
    print(h5_file['group2/group3'].parent)

<HDF5 group "/group2" (1 members)>


### Attributes

In [64]:
with h5py.File('data/my_h5py_file.h5', 'w') as h5_file:
    # attributes can be attached to a dataset ...
    numbers = h5_file.create_dataset('my_dataset', data=[1, 2, 3, 4])
    numbers.attrs['Unit'] = 4
    # ... and to a group, either dict-style or through attrs.create()
    group_attrs = h5_file.create_group('my_group').attrs
    group_attrs['Created'] = '18/12/2017'
    group_attrs.create('Versions', data=np.array([123, 456]))

### Traverse an HDF5 file with h5py

In [65]:
with h5py.File(name='data/my_h5py_file.h5', mode='r') as f:
    # visit() calls the given function with the name of each object in the file;
    # passing print simply lists those names
    f.visit(print)

my_dataset
my_group


## HDF5 Command Line Tools

[Here](https://support.hdfgroup.org/products/hdf5_tools/#h5dist) you can find the command line tools developed by the HDF Group. You don't need h5py or PyTables to use them.

If you are on Ubuntu, you can install them with `sudo apt install hdf5-tools` but they are also installed when you install `h5py` or `PyTables` with `conda`.

In [66]:
# -r stands for 'recursive' 
!h5ls -r 'data/my_h5py_file.h5'

/                        Group
/my_dataset              Dataset {4}
/my_group                Group


In [67]:
!h5dump 'data/my_h5py_file.h5'

HDF5 "data/my_h5py_file.h5" {
GROUP "/" {
   DATASET "my_dataset" {
      DATATYPE  H5T_STD_I64LE
      DATASPACE  SIMPLE { ( 4 ) / ( 4 ) }
      DATA {
      (0): 1, 2, 3, 4
      }
      ATTRIBUTE "Unit" {
         DATATYPE  H5T_STD_I64LE
         DATASPACE  SCALAR
         DATA {
         (0): 4
         }
      }
   }
   GROUP "my_group" {
      ATTRIBUTE "Created" {
         DATATYPE  H5T_STRING {
            STRSIZE H5T_VARIABLE;
            STRPAD H5T_STR_NULLTERM;
            CSET H5T_CSET_UTF8;
            CTYPE H5T_C_S1;
         }
         DATASPACE  SCALAR
         DATA {
         (0): "18/12/2017"
         }
      }
      ATTRIBUTE "Versions" {
         DATATYPE  H5T_STD_I64LE
         DATASPACE  SIMPLE { ( 2 ) / ( 2 ) }
         DATA {
         (0): 123, 456
         }
      }
   }
}
}


### Reading existing HDF5 file

In [68]:
# Opened without a `with` block on purpose: the following cells keep reading
# from `f`, so the handle stays open (it is not closed anywhere in this section).
f = h5py.File(file_name, 'r')

In [69]:
list(f.keys())

['genotype', 'phenotype']

The root object contains two children (*genotype* and *phenotype*).  
To find out what type (dataset or group) of object the *phenotype* is we can run: 

In [70]:
f['phenotype']

<HDF5 group "/phenotype" (3 members)>

It looks like the *phenotype* object itself is a group with 3 children

In [71]:
list(f['phenotype'].keys())

['col_header', 'matrix', 'row_header']

An easy way to iterate over all elements of an HDF5 file is to use the `visit` method:

In [72]:
def printname(name):
    """Print ``name`` followed by the description of the object stored under it in ``f``."""
    print("{}: {} ".format(name, f[name]))
f.visit(printname)

genotype: <HDF5 group "/genotype" (3 members)> 
genotype/col_header: <HDF5 group "/genotype/col_header" (3 members)> 
genotype/col_header/alleles: <HDF5 dataset "alleles": shape (11623, 2), type "|S37"> 
genotype/col_header/chrom: <HDF5 dataset "chrom": shape (11623,), type "<i8"> 
genotype/col_header/pos: <HDF5 dataset "pos": shape (11623,), type "<i8"> 
genotype/matrix: <HDF5 dataset "matrix": shape (1008, 11623), type "|u1"> 
genotype/row_header: <HDF5 group "/genotype/row_header" (1 members)> 
genotype/row_header/sample_ID: <HDF5 dataset "sample_ID": shape (1008,), type "|S99"> 
phenotype: <HDF5 group "/phenotype" (3 members)> 
phenotype/col_header: <HDF5 group "/phenotype/col_header" (1 members)> 
phenotype/col_header/phenotype_ID: <HDF5 dataset "phenotype_ID": shape (46,), type "|S22"> 
phenotype/matrix: <HDF5 dataset "matrix": shape (1008, 46), type "<f8"> 
phenotype/row_header: <HDF5 group "/phenotype/row_header" (1 members)> 
phenotype/row_header/sample_ID: <HDF5 dataset "samp

Every object in an HDF5 file can have custom attributes which can be checked like any other dictionary:

In [73]:
list(f['phenotype']['matrix'].attrs.keys())

[]

Let us examine the *phenotype/matrix* dataset as a `Dataset` object.   
The object we obtained isn’t an array, but an *HDF5* dataset. Like *NumPy* arrays, datasets have both a *shape* and a *data type*:

In [74]:
dset = f['phenotype/matrix']
# the dataset is two-dimensional, so its shape unpacks into rows and columns
n_rows, n_cols = dset.shape
print('shape: ({},{})'.format(n_rows, n_cols))
print('dtype: {}'.format(dset.dtype))

shape: (1008,46)
dtype: float64


They also support array-style slicing. This is how you read data from and write data to a dataset in the file. 

In [75]:
# retrieve rows 10 through 19 (the stop index 20 is excluded) of the column at index 2, i.e. the third column
dset[10:20, 2]

array([ 2.24597261,  2.6897812 , -0.46048117,  0.03086018, -0.29291799,
       -1.09117648,  0.70117087, -0.86986989, -0.13682744, -0.28441332])

When you filter, slice or index the rows and columns of a dataset, only the selected part is loaded into main memory rather than the entire dataset. This is very useful if you need to retrieve a subset of the data from a terabyte-sized HDF5 file.  
To read the entire dataset into memory, run:

In [76]:
dset[:]

array([[-7.32351971e+00,  2.79992827e-01,  3.13118166e-01, ...,
         8.90841948e-01,  4.11837231e+00,  8.59281836e+00],
       [-8.09823582e+00, -2.06326076e-01, -5.34843783e-01, ...,
         6.06164117e-03,  6.65150029e-02, -4.22047646e+00],
       [ 7.60571968e+00, -1.27959825e-01, -3.11102424e-01, ...,
         1.72315729e+00,  5.71408803e+00, -6.50651895e+00],
       ...,
       [            nan,  3.21844466e-01,             nan, ...,
         1.80142187e+00,             nan, -4.99069190e-01],
       [            nan, -1.00150686e+00,             nan, ...,
        -1.74752575e+00,             nan, -7.43596147e+00],
       [            nan,             nan,             nan, ...,
                    nan,             nan,             nan]])

In [77]:
# dset[:] reads the complete dataset; the result is a regular NumPy array
arr = dset[:]
print(type(arr))
arr

<class 'numpy.ndarray'>


array([[-7.32351971e+00,  2.79992827e-01,  3.13118166e-01, ...,
         8.90841948e-01,  4.11837231e+00,  8.59281836e+00],
       [-8.09823582e+00, -2.06326076e-01, -5.34843783e-01, ...,
         6.06164117e-03,  6.65150029e-02, -4.22047646e+00],
       [ 7.60571968e+00, -1.27959825e-01, -3.11102424e-01, ...,
         1.72315729e+00,  5.71408803e+00, -6.50651895e+00],
       ...,
       [            nan,  3.21844466e-01,             nan, ...,
         1.80142187e+00,             nan, -4.99069190e-01],
       [            nan, -1.00150686e+00,             nan, ...,
        -1.74752575e+00,             nan, -7.43596147e+00],
       [            nan,             nan,             nan, ...,
                    nan,             nan,             nan]])

### Exercise: Create h5py file

<div class="alert alert-success">
Write a function, <code>create_h5py_hdf5</code>, which takes an array of <code>participants</code> and their <code>reaction_times</code> as parameters and stores them in an HDF5 file using the h5py library. 
</div>

In [None]:
participants = np.load("data/experiment_participants.npy")
reaction_times = np.load("data/experiment_data.npy")

In [None]:
def create_h5py_hdf5(participants, reaction_times):
    """Create an HDF5 file using h5py containing a dataset reaction_times and the participants as an attribute.

    Exercise stub: until the marked section is filled in, calling this
    function raises NotImplementedError.

    Make sure to:

      * Store reaction_times as a dataset inside the group /experiments
      * Compress the reaction_times dataset with zlib at level 5
        (the check cell expects compression == 'gzip' and compression_opts == 5)
      * Return the filename of the generated HDF5 file
      * Use float32 as the data type for the reaction_times
      * Store the mean reaction times as an attribute 'ameanvg' on the reaction_times dataset
      * Store the participants array as an attribute "participants" on the /experiments group
        (the check cell reads it back with np.char.decode, i.e. it expects byte strings)

    Parameters
    ----------
    participants : numpy array with shape (n,)
    reaction_times : numpy array holding the reaction times of the n participants
        (presumably one row per participant, i.e. shape (n, m) - TODO confirm)

    Returns
    -------
    file_name : filename of the generated HDF5 file

    """
    # YOUR CODE HERE
    raise NotImplementedError()
    # YOUR CODE HERE
    return file_name

In [None]:
import os
from numpy.testing import assert_array_equal, assert_almost_equal
# NOTE: this rebinds the notebook-level `file_name` that was first set from
# tutorial_data.get_file('BYxRM') at the top of the notebook.
file_name = create_h5py_hdf5(participants,reaction_times)
assert os.path.exists(file_name)
with h5py.File(file_name, 'r') as hf:
    # exactly one object below the root, and exactly one inside /experiments
    assert len(hf.keys()) == 1
    assert len(hf['experiments'].keys()) == 1
    # the participants attribute is decoded from bytes before the comparison
    assert_array_equal(participants, np.char.decode(hf['experiments'].attrs['participants'][:]))
    # compared to 3 decimals only
    assert_almost_equal(reaction_times, hf['experiments']['reaction_times'][:],decimal=3)
    # the dataset must report gzip compression with option 5
    assert hf['experiments']['reaction_times'].compression == 'gzip'
    assert hf['experiments']['reaction_times'].compression_opts == 5
    print("Success!")

## PyTables

- Higher abstraction over HDF5 (it's more "batteries included")
- Searches are faster than in h5py
- Does not depend on h5py (at the moment)

![pytables logo](images/pytables-logo.png)

In [78]:
import tables as tb

### Array

In [79]:
with tb.open_file('data/my_pytables_file.h5', mode='w') as h5_file:
    # create the array directly below the root group and give it a title
    h5_file.create_array('/', 'my_array', obj=[1, 2, 3, 4], title='My PyTables Array')

PyTables has a feature called "Natural Naming": nodes (i.e. datasets and groups in the HDF5 file) can be accessed with the dot notation.

In [80]:
with tb.open_file(filename='data/my_pytables_file.h5', mode='r') as f:
    # natural naming: the node /my_array is reached as an attribute of f.root
    print(f.root.my_array)

/my_array (Array(4,)) 'My PyTables Array'


### Groups

In [81]:
with tb.open_file('data/my_pytables_file.h5', mode='w') as h5_file:
    # add a single group directly below the root of the file
    h5_file.create_group(h5_file.root, 'my_group')

### Attributes

In [82]:
with tb.open_file('data/my_pytables_file.h5', mode='w') as h5_file:
    # every node is created first and then tagged with an attribute, addressed by its path
    h5_file.create_array(h5_file.root, 'my_array', obj=[1, 2, 3, 4], title='My PyTables Array')
    h5_file.set_node_attr('/my_array', 'SomeAttribute', 'SomeValue')
    h5_file.create_group('/', 'my_group')
    h5_file.set_node_attr('/my_group', 'SomeOtherAttribute', 123)

### HDF5 datasets have many abstractions in PyTables

Homogeneous dataset:

- Array
- CArray
- EArray
- VLArray

Heterogeneous dataset:

- Table

In [83]:
# Seeded generator: the example data (and the file sizes derived from it further
# down) are identical on every Restart & Run All.
rng = np.random.default_rng(seed=42)

num_rows = 1000000  # 1 million
num_columns = 5
gaussian = rng.normal(loc=0, scale=1, size=num_rows).astype('float32')
# drawn as floats in [100, 150); the uint8 cast drops the fractional part
uniform = rng.uniform(low=100, high=150, size=num_rows).astype('uint8')
matrix = rng.random((num_rows, num_columns)).astype('float32')

### Array (again!)
[Docs](http://www.pytables.org/usersguide/libref/homogenous_storage.html#the-array-class)

- Fastest I/O speed
- Must fit in memory
- Not compressible
- Not enlargeable

In [84]:
%%time
with tb.open_file(filename='data/my_pytables_file.h5', mode='w') as f:
    f.create_array(where='/', name='gaussian', obj=gaussian)
    f.create_array(where='/', name='uniform', obj=uniform)
    f.create_array(where='/', name='matrix', obj=matrix)

CPU times: user 3.26 ms, sys: 9.18 ms, total: 12.4 ms
Wall time: 14.3 ms


In [85]:
!ls -lh data/my_pytables_file.h5

-rw-r--r--  1 uemit.seren  staff    24M Nov 14 18:22 data/my_pytables_file.h5


### CArray
[Docs](http://www.pytables.org/usersguide/libref/homogenous_storage.html#carrayclassdescr)

- Compressible
- Not enlargeable

In [86]:
# Compression settings (zlib, level 5) passed as filters= to the CArray and
# EArray cells further down.
filters = tb.Filters(complevel=5, complib='zlib')

Tips on how to use compression (from the PyTables docs)

- A mid-level (5) compression is sufficient. No need to go all the way up (9)
- Use zlib if you must guarantee complete portability
- Use blosc all other times (it is optimized for HDF5)

In [87]:
%%time
with tb.open_file(filename='data/my_pytables_file.h5', mode='w') as f:
#     f.create_carray(where='/', name='gaussian', obj=gaussian)
#     f.create_carray(where='/', name='uniform', obj=uniform)
#     f.create_carray(where='/', name='matrix', obj=matrix)
    f.create_carray(where='/', name='gaussian', obj=gaussian, filters=filters)
    f.create_carray(where='/', name='uniform', obj=uniform, filters=filters)
    f.create_carray(where='/', name='matrix', obj=matrix, filters=filters)

CPU times: user 627 ms, sys: 22.9 ms, total: 650 ms
Wall time: 652 ms


In [88]:
!ls -lh data/my_pytables_file.h5

-rw-r--r--  1 uemit.seren  staff    20M Nov 14 18:22 data/my_pytables_file.h5


### EArray
[Docs](http://www.pytables.org/usersguide/libref/homogenous_storage.html#earrayclassdescr)

- Enlargeable on one dimension (append)
- Compressible

In [89]:
# Exactly one entry of the shape has to be 0: that is the axis along which the
# EArray can grow later. More than one enlargeable axis is not supported.
shape = (0, num_columns)

with tb.open_file('data/my_pytables_file.h5', mode='w') as h5_file:
    # an EArray may start out empty, but then atom and shape must be given up front
    h5_file.create_earray('/', 'my_earray', atom=tb.Float32Atom(), shape=shape, filters=filters)

In [90]:
with tb.open_file('data/my_pytables_file.h5', mode='a') as h5_file:
    # append the first row of `matrix`; the 0:1 slice keeps it two-dimensional
    h5_file.root.my_earray.append(matrix[0:1, :])

In [91]:
with tb.open_file('data/my_pytables_file.h5', mode='a') as h5_file:
    # append rows 1 to 4 of `matrix` behind what is already stored
    h5_file.root.my_earray.append(matrix[1:5, :])

### Table
[Docs](http://www.pytables.org/usersguide/libref/structured_storage.html?highlight=table#tableclassdescr)

- Data can be heterogeneous (i.e. different shapes and different dtypes)
- The structure of a table is declared by its description
- multi-column searches

To map Python records onto HDF5 C structs, PyTables provides a special class, `IsDescription`, with which you declare the fields of a record and their properties.

A *description* defines the table structure (basically, the *schema* of your table).

In [92]:
class Particle(tb.IsDescription):
    # Table description with two columns; `pos` fixes the column order.
    identity = tb.StringCol(22, dflt=' ', pos=0)  # string of at most 22 bytes
    idnumber = tb.Int16Col(dflt=1, pos=1)  # 16-bit integer

In [93]:
print(Particle.columns)

{'identity': StringCol(itemsize=22, shape=(), dflt=b' ', pos=0), 'idnumber': Int16Col(shape=(), dflt=1, pos=1)}


In [94]:
with tb.open_file(filename='data/my_pytables_file.h5', mode='w') as f:
    table = f.create_table(where='/', name='my_table', description=Particle)

    # Fetch the Row accessor once and reuse it for every record. Each access to
    # `table.row` may hand back a new Row object, so values set through one
    # access are not guaranteed to be the ones appended through another; holding
    # a single Row is the pattern used in the PyTables documentation.
    row = table.row
    for i in range(100):
        row['identity'] = 'I am {identity}'.format(identity=i)
        row['idnumber'] = i
        row.append()
    table.flush()  # write the rows still held in the table's buffer to disk

In [95]:
with tb.open_file(filename='data/my_pytables_file.h5', mode='w') as f:
    table = f.create_table(where='/', name='my_table', description=Particle)

    # Table.append() accepts a whole sequence of records, so build all 100 rows
    # first and hand them over in one call instead of appending one row at a time
    records = [('I am {identity}'.format(identity=i), i) for i in range(100)]
    table.append(records)

    table.flush()  # write the rows still held in the table's buffer to disk

#### Traverse an HDF5 file with PyTables

In [96]:
with tb.open_file('data/my_pytables_file.h5', mode='r') as h5_file:
    # walk the hierarchy from the root, restricted to Table nodes, and print each node's path
    for table_node in h5_file.walk_nodes('/', classname='Table'):
        print(table_node._v_pathname)

/my_table


#### Expressions with NumExpr

In [97]:
x = np.random.uniform(low=1, high=5, size=num_rows).astype('float32')

In [98]:
%%time
with tb.open_file(filename='data/my_pytables_file.h5', mode='w') as f:
    carray = f.create_carray(where='/', name='carray_without_numexpr', atom=tb.Float32Atom(), shape=x.shape)
    carray[:] = x**3 + 0.5*x**2 - x

CPU times: user 23.7 ms, sys: 10.1 ms, total: 33.8 ms
Wall time: 33 ms


In [99]:
%%time
with tb.open_file(filename='data/my_pytables_file.h5', mode='w') as f:
    carray = f.create_carray(where='/', name='carray_with_numexpr', atom=tb.Float32Atom(), shape=x.shape)
    ex = tb.Expr('x**3 + 0.5*x**2 - x')
    ex.set_output(carray) # output will got to the CArray on disk
    ex.eval()

CPU times: user 13.6 ms, sys: 12.7 ms, total: 26.3 ms
Wall time: 16 ms


## NYC Yellow Taxi Dataset (2015)

On a regular laptop (Thinkpad X220 i5 10GB RAM) it took roughly:

 - **40 minutes** to read/store a **single CSV**
 - **8 hours** to read/store an entire **year**
 
 ![still-waiting-meme](images/still-waiting-meme.jpg "Still waiting")
 
 
 To see how to improve the runtime with PyTables check out [Giacomo Debidda's notebook](https://github.com/jackdbd/hdf5-pydata-munich/blob/master/hdf5_in_python.ipynb)

### Exercise: Create PyTables file

<div class="alert alert-success">
Write a function, <code>create_pytables_hdf5</code>, which takes an array of <code>participants</code> and their <code>reaction_times</code> as parameters and stores them in an HDF5 file as a PyTables table. 
</div>

<div class="alert alert-warning">
    You can use <a href="https://numpy.org/doc/stable/reference/generated/numpy.core.records.fromarrays.html">NumPy structured arrays</a> to store the data in the PyTables table. Also make sure that you calculate the mean reaction time for each participant (the <code>axis</code> argument matters when calling NumPy's <code>mean</code> function).
</div>

In [None]:
participants = np.load("data/experiment_participants.npy")
reaction_times = np.load("data/experiment_data.npy")

In [None]:
def create_pytables_hdf5(participants, reaction_times):
    """Create an HDF5 file using PyTables that contains a PyTables table.

    Exercise stub: until the marked section is filled in, calling this
    function raises NotImplementedError.

    Make sure to:

      * The table should be called 'experiment' and be the only node below the root
      * The first column holds the participants (the check cell reads it as 'participant')
      * The second column holds the mean reaction time of each participant
        (the check cell reads it as 'mean_reaction_time')
      * Return the filename of the generated HDF5 file
      * Create an index on the participants column
      * The mean reaction time column should be of type float32

    Parameters
    ----------
    participants : numpy array with shape (n,)
    reaction_times : numpy array with one row per participant, shape (n, m);
        the check cell averages it with np.mean(reaction_times, axis=1)

    Returns
    -------
    file_name : filename of the generated HDF5 file

    """
    # YOUR CODE HERE
    raise NotImplementedError()
    # YOUR CODE HERE
    return file_name

In [None]:
import os
from numpy.testing import assert_array_equal, assert_almost_equal
# NOTE: this rebinds the notebook-level `file_name`.
file_name = create_pytables_hdf5(participants,reaction_times)
assert os.path.exists(file_name)
with tb.open_file(file_name, 'r') as hf:
    # the table must be the only node directly below the root
    assert len(hf.list_nodes('/')) == 1
    assert  hf.root.experiment is not None
    # NOTE(review): `recarray` is not used by any of the assertions below
    recarray = hf.root.experiment.read()
    # column-level requirements: index on 'participant', float32 for 'mean_reaction_time'
    assert hf.root.experiment.cols.participant.is_indexed
    assert hf.root.experiment.cols.mean_reaction_time.type == 'float32'
    # participants are decoded from bytes before the comparison
    assert_array_equal(participants, np.char.decode(hf.root.experiment.read(field='participant')))
    # per-participant mean (over axis 1), compared to 3 decimals
    assert_almost_equal(np.mean(reaction_times,axis=1), hf.root.experiment.read(field='mean_reaction_time'),decimal=3)
    print("Success!")