# Databases Benchmark

This note is a toy model to measure the performance cost of databases:
Sqlite3, Shelve, Numpy, and PyTables.

3 different sizes of arrays will be created with numpy random sample: x (2D), dx (4D), rdx (3D).

In [1]:
import numpy as np
import io
import time
import tables
import shelve
import sqlite3

In [2]:
# Common variables

s = 219 
I, d = 64, 30 # The size of the arrays

n = np.array([I])
x, dx, rdx = [], [], []
for i in range(219):
    x.append(np.random.random_sample([I, d]))
    dx.append(np.random.random_sample([I, I, d, 3]))
    rdx.append(np.random.random_sample([I, d, 6]))

In [3]:
### Shelve ###

# Insert
db = shelve.open("database")
t0 = time.time()
for i in range(s):
    db[str(i)] = [x[i], dx[i], rdx[i]]
    
    if i == 5:
        print("Original i-th array")
        print(x[i][5, :10])
        print(dx[i][5, 5, :10, 1])
        print(rdx[i][5, :10, 4])
t1 = time.time()
db.close()

# Reading
db = shelve.open("database")
_t0 = time.time()
for i in range(s):
    _d = db[str(i)]
    _x = _d[0]
    _dx = _d[1]
    _rdx = _d[2]
_t1 = time.time()

print("Extracted i-th array")
print(db['5'][0][5, :10])
print(db['5'][1][5, 5, :10, 1])
print(db['5'][2][5, :10, 4])
print("The time it takes to save shelve database:", t1-t0, "s")
print("The time it takes to load shelve database:", _t1-_t0, "s")
db.close()

Original i-th array
[0.33948743 0.48868521 0.51485682 0.08198881 0.30296833 0.65542285
 0.16506783 0.58281834 0.11599848 0.94128936]
[0.85511308 0.20217528 0.62297042 0.78892205 0.52779912 0.61027226
 0.71761675 0.19223772 0.92188666 0.88631914]
[0.06427396 0.12132217 0.52165158 0.95120372 0.17288603 0.02940254
 0.91761227 0.75852959 0.04859202 0.79117988]
Extracted i-th array
[0.33948743 0.48868521 0.51485682 0.08198881 0.30296833 0.65542285
 0.16506783 0.58281834 0.11599848 0.94128936]
[0.85511308 0.20217528 0.62297042 0.78892205 0.52779912 0.61027226
 0.71761675 0.19223772 0.92188666 0.88631914]
[0.06427396 0.12132217 0.52165158 0.95120372 0.17288603 0.02940254
 0.91761227 0.75852959 0.04859202 0.79117988]
The time it takes to save shelve database: 1.0530524253845215 s
The time it takes to load shelve database: 0.7090840339660645 s


In [4]:
### Sqlite3 ###

def adapt_array(arr):
    return arr.tobytes()

def convert_array(text):
    return np.frombuffer(text)

# Converts np.array to TEXT when inserting
sqlite3.register_adapter(np.ndarray, adapt_array)

# Converts TEXT to np.array when selecting
sqlite3.register_converter("array", convert_array)

conn = sqlite3.connect("sqldatabase.db", detect_types=sqlite3.PARSE_DECLTYPES)
c = conn.cursor()

try:
    c.execute("create table test (num integer, x array, dx array, rdx array)")
except:
    pass

# Insert
t0 = time.time()
for i in range(s):
    c.execute("insert into test (num, x, dx, rdx) values (?, ?, ?, ?)", (i, x[i], dx[i], rdx[i]))
t1 = time.time()

# Reading
_t0 = time.time()
for i in range(s):
    c.execute("select * from test where num=?", (i,))
    data = c.fetchone()
    _x = np.reshape(data[1], x[i].shape)
    _dx = np.reshape(data[2], dx[i].shape)
    _rdx = np.reshape(data[3], rdx[i].shape)
_t1 = time.time()
c.execute("select * from test where num=?", (5,))
data = c.fetchone()
print(np.reshape(data[1], x[5].shape)[5, :10])
print(np.reshape(data[2], dx[5].shape)[5, 5, :10, 1])
print(np.reshape(data[3], rdx[5].shape)[5, :10, 4])
print("The time it takes to save sqlite3 database:", t1-t0, "s")
print("The time it takes to load sqlite3 database:", _t1-_t0, "s")

[0.33948743 0.48868521 0.51485682 0.08198881 0.30296833 0.65542285
 0.16506783 0.58281834 0.11599848 0.94128936]
[0.85511308 0.20217528 0.62297042 0.78892205 0.52779912 0.61027226
 0.71761675 0.19223772 0.92188666 0.88631914]
[0.06427396 0.12132217 0.52165158 0.95120372 0.17288603 0.02940254
 0.91761227 0.75852959 0.04859202 0.79117988]
The time it takes to save sqlite3 database: 1.9592840671539307 s
The time it takes to load sqlite3 database: 0.9866747856140137 s


In [5]:
### Numpy ###

_data = []
for i in range(s):
    _data.append({'x': x[i], 'dx': dx[i], 'rdx': rdx[i]})

np.save("database.npy", _data)

t0 = time.time()
data = np.load("database.npy", allow_pickle=True)
for i in range(s):
    _x = data[i]['x']
    _dx = data[i]['dx']
    _rdx = data[i]['rdx']
t1 = time.time()

print(data[5]['x'][5, :10])
print(data[5]['dx'][5, 5, :10, 1])
print(data[5]['rdx'][5, :10, 4])
print("The time it takes to load numpy database:", t1-t0, "s")

[0.33948743 0.48868521 0.51485682 0.08198881 0.30296833 0.65542285
 0.16506783 0.58281834 0.11599848 0.94128936]
[0.85511308 0.20217528 0.62297042 0.78892205 0.52779912 0.61027226
 0.71761675 0.19223772 0.92188666 0.88631914]
[0.06427396 0.12132217 0.52165158 0.95120372 0.17288603 0.02940254
 0.91761227 0.75852959 0.04859202 0.79117988]
The time it takes to load numpy database: 3.6811532974243164 s


In [6]:
### Numpy Alt ###

_x, _dx, _rdx = [], [], []
for i in range(s):
    _x.append(x[i])
    _dx.append(dx[i])
    _rdx.append(rdx[i])

#np.save("x.npy", _x)
np.save("dx.npy", _dx)
#np.save("rdx.npy", _rdx)

t0 = time.time()
for i in range(s):    
    #_x = np.load("x.npy", allow_pickle=True, mmap_mode='r')
    _dx = np.load("dx.npy", allow_pickle=True, mmap_mode='r')
    _DX = _dx[i]
    #_rdx = np.load("rdx.npy", allow_pickle=True, mmap_mode='r')
t1 = time.time()

print(_dx[5][5, 5, :10, 1])
print("The time it takes to load numpy database:", t1-t0, "s")

[0.85511308 0.20217528 0.62297042 0.78892205 0.52779912 0.61027226
 0.71761675 0.19223772 0.92188666 0.88631914]
The time it takes to load numpy database: 0.0858304500579834 s


In [7]:
### PyTables ###

hdf5_path = "test.hdf5"

# Write data
hdf5_file = tables.open_file(hdf5_path, mode='w')
#filters = tables.Filters(complevel=5, complib='blosc')

t0 = time.time()
earray1 = hdf5_file.create_earray(
    hdf5_file.root,
    'x', 
    tables.Atom.from_dtype(x[0].dtype), 
    shape=(0, 30), 
    #filters=filters,
    expectedrows=14000
    )
earray2 = hdf5_file.create_earray(
    hdf5_file.root,
    'dx', 
    tables.Atom.from_dtype(x[0].dtype), 
    shape=(0, 64, 30, 3), 
    #filters=filters,
    expectedrows=14000
    )
earray3 = hdf5_file.create_earray(
    hdf5_file.root,
    'rdx',
    tables.Atom.from_dtype(x[0].dtype),
    shape=(0, 30, 6),
    #filters=filters,
    expectedrows=14000
    )
numbers = np.zeros(s, dtype=int)
for i in range(s):
    numbers[i] = I
    earray1.append(x[i])
    earray2.append(dx[i])
    earray3.append(rdx[i])

hdf5_file.close()
    
t1 = time.time()

print("The time it takes to save hdf5 database:", t1-t0, "s")

hdf5_file = tables.open_file(hdf5_path, mode='r')
_t0 = time.time()
for i in range(s):
    id_0 = sum(numbers[:i])
    id_1 = sum(numbers[:i+1])
    _x = hdf5_file.root.x[id_0:id_1, :]
    _dx = hdf5_file.root.dx[id_0:id_1, :, :, :]
    _rdx = hdf5_file.root.rdx[id_0:id_1, :, :]
    if i == 5:
        print(_x[5, :10])
_t1 = time.time()
hdf5_file.close()
print("The time it takes to load hdf5 database:", _t1-_t0, "s")

The time it takes to save hdf5 database: 0.27224040031433105 s
[0.33948743 0.48868521 0.51485682 0.08198881 0.30296833 0.65542285
 0.16506783 0.58281834 0.11599848 0.94128936]
The time it takes to load hdf5 database: 0.3594202995300293 s


### Conclusion

Shelve is not very stable. For every run of shelve loading, it will give a range of time. In my computer, it ranges from 0.6 - 2.3 s.

Sqlite3 is more stable. In which, the loading time is at about 1 second.

Numpy alt provides the best time. However, it may not be feasible in many application since dx may not have the same size for its every member.

It seems like PyTables provide the best option in term of flexibility and loading time at 0.3 s (max=0.6s).