# WAVs in TileDB
Prototype for storing waveform data in TileDB


In [None]:
import scipy.io.wavfile as wf
import numpy as np
import tiledb
import os
import pathlib
from datetime import datetime

# Source recording used as the seed clip
in_file = "../data/20230425_150920.WAV"
# Output name embeds the device ID; added by hand for now
out_file = "../data/20230425_150920_24F3190361DA539A.wav"
# Number of bytes in one MB (2**20), used when reporting sizes
MB = 1 << 20

### Read WAV file
This is a ~12 s file sampled at 192 kHz, stored as 16-bit integer PCM.

In [None]:
# Load the recording: `rate` is the sampling rate in Hz, `data` the PCM samples
rate, data = wf.read(in_file)
# Count sample frames along the first axis. data.size would also count the
# channel axis and over-estimate the duration for multi-channel files.
secs = data.shape[0] // rate
if secs == 0:
    # Fail here with a clear message rather than with a division by zero
    # later, when the clip is repeated to fill an hour.
    raise ValueError(f"{in_file} is shorter than one second; cannot build test data from it")
print(f"rate: {rate}, data shape: {data.shape}, dtype: {data.dtype}, secs: {secs}")
# Trim to a whole number of seconds so the clip can be repeated cleanly
data = data[0:secs*rate]

In [None]:
# Repeat the clip to create 1 hr of data and save it as a new WAV file.
# n is the number of whole clips that fit in an hour; if the clip length does
# not divide 3600 s evenly the result is slightly shorter than 1 hr.
n = 3600//secs
data_hr = np.tile(data, n)
wf.write(out_file, rate, data_hr)
# Report the in-memory size using the array's real byte count (nbytes) instead
# of assuming 2 bytes per sample, and state the unit in the message.
print(f"1 hr of data ~ {data_hr.nbytes // MB} MB")

### Store in TileDB

[TileDB](https://docs.tiledb.com/main/) is an in-process DB for storing array data. Data is stored in a "tile" format for fast array access.
We can store data using one 2D array per deployment, where each unit (recording device) is a row and the samples (time) are the columns. The tiling can then be configured according to common access patterns.


In [None]:
# Configure the process-wide default TileDB context with a 50 MB initial
# read buffer.
#
# tiledb.default_ctx() does not return None: called without arguments it
# creates the default context on first use. The previous `is None` guard was
# therefore always false and this setting was never applied. The config has to
# be passed on the first default_ctx() call instead. Once a default context
# exists, passing a config raises TileDBError, which is what happens when this
# cell is re-run in the same kernel, so that case is reported and ignored.
cfg = tiledb.Config(
    {
        'py.init_buffer_bytes': 1024**2 * 50
    }
)
try:
    tiledb.default_ctx(cfg)
except tiledb.TileDBError:
    print("TileDB default context already initialized; keeping its existing config")

# Paths
# Root directory for the TileDB data, located under the user's home directory
dbdir = os.path.expanduser("~/tile/")
# Create the root directory if it is missing; an existing directory is not an error
pathlib.Path(dbdir).mkdir(exist_ok=True)
# Location of the array for this deployment (one array per deployment)
arr_path = os.path.join(dbdir, "deployment1")

# Names of the sample attribute and of the array metadata keys
ATTRIB_SAMPLE = "sample"
META_DEPL_DATE = "Deployment_Date"
META_RATE = "Sampling_Rate"

# Sizing of the array for a single deployment. These values are a best guess
# for now and can be adjusted to the real data size once it is known.
RATE = 192000  # samples per second
MAX_UNITS = 32
MAX_DAYS = 7  # ?? not settled yet
EXTENT_HR = RATE * 3600  # candidate tile width: 1 hr of samples (open question)
MAX_TIME = MAX_DAYS * 24 * EXTENT_HR  # samples in MAX_DAYS days

# Create the dense deployment array on first use; an existing path is left untouched.
if not os.path.exists(arr_path):
    # Create the two dimensions: unit -> rows, time -> columns.
    # TileDB dimension domains are inclusive on both ends, so the upper bounds
    # are MAX - 1. That gives exactly MAX_UNITS rows and MAX_TIME columns, and
    # the time axis is then a whole number of EXTENT_HR-wide tiles.
    unit_dim = tiledb.Dim(name="unit", domain=(0, MAX_UNITS - 1), tile=1, dtype=np.int64)
    time_dim = tiledb.Dim(name="time", domain=(0, MAX_TIME - 1), tile=EXTENT_HR, dtype=np.int64)
    # Create a domain using the two dimensions
    dom1 = tiledb.Domain(unit_dim, time_dim)
    # One int16 attribute per cell, matching the 16-bit PCM samples
    attrib_sample = tiledb.Attr(name=ATTRIB_SAMPLE, dtype=np.int16)
    schema = tiledb.ArraySchema(domain=dom1, sparse=False, attrs=[attrib_sample])
    tiledb.Array.create(arr_path, schema)

def samples_hr(hrs: int) -> int:
    """Return the number of samples covering ``hrs`` hours at the module-level RATE."""
    seconds = hrs * 3600
    return seconds * RATE


In [None]:
# Load the schema of the array stored at arr_path
schema = tiledb.ArraySchema.load(arr_path)
# Bare expression as the last line of the cell -- presumably so the notebook displays the schema
schema

In [None]:
# Simulate 3 hrs of data for 2 units by repeating the 1 hr signal:
# 2 copies along the rows (units) and 3 copies along the columns (time)
db_data = np.tile(data_hr, (2, 3))
print(db_data.shape)


In [None]:
%%time
# Store in the DB
# Take the write region from the data itself instead of hard-coding 2 units x
# 3 hrs, so the assignment still matches when the seed clip does not divide an
# hour evenly and data_hr is slightly shorter than 3600 s.
n_units, n_samples = db_data.shape
with tiledb.open(arr_path, 'w') as A:
    # Data: rows are units, columns are samples, both starting at index 0
    A[0:n_units, 0:n_samples] = db_data
    # Metadata: timezone-aware ISO 8601 timestamp, so the stored date is
    # unambiguous when read on another machine
    A.meta[META_DEPL_DATE] = datetime.now().astimezone().isoformat()
    A.meta[META_RATE] = RATE
    

In [None]:
%%time
# Open the array read-only. The handle is kept open because the next read
# cell reuses it; it is closed explicitly afterwards.
A = tiledb.open(arr_path, 'r')
print(f"Deployment date: {A.meta[META_DEPL_DATE]}, sampling rate: {A.meta[META_RATE]}")
# Number of samples in 5 minutes
MIN_5 = RATE*60*5
# Begin reading at the 2 hr mark
start = samples_hr(2)
# Read 5 mins of data for two units
window = slice(start, start + MIN_5)
d = A[0:2, window][ATTRIB_SAMPLE]
print(f"Read shape: {d.shape}, size: {d.size*2//MB} MB")

In [None]:
%%time
# Read the 5 minutes that directly follow the previous window, for the same two units
next_start = start + MIN_5
d = A[0:2, next_start:next_start + MIN_5][ATTRIB_SAMPLE]

In [None]:
# Release the read handle opened for the timed reads above
A.close()

## TODO
- Figure out the best way to store:
    - The mapping of device IDs to the row indices
    - The clock drifts, which will be sparse.
- Investigate if the schema can be more efficient through different cell and tile orders and tile sizes