# Background tests for data formatting

## floating point error
Can we use 16-bit floats to represent waveform data with acceptable accuracy?

In [1]:
from __future__ import annotations
import numpy as np
from matplotlib import pyplot as plt
import xarray as xr
import numcodecs
from edge_analyzer import io
from pathlib import Path

path = 'test'

def recursive_size(path):
    path = Path(path)
    if path.is_file():
        return path.stat().st_size

    size = 0
    for p in Path(path).glob('**/*'):
        if p.is_file():
            size += p.stat().st_size

    return size

fs=15.36e6
x = io.simulated_awgn(duration=0.1, sample_rate=fs)
buf = x.tobytes()
arr = xr.DataArray(x)
ds = xr.Dataset({'iq_waveform': arr})
mem_size = (np.finfo(x.dtype).bits//8) * x.size

In [2]:
numcodecs.Blosc(cname='zfp')

Blosc(cname='zfp', clevel=5, shuffle=SHUFFLE, blocksize=0)

In [3]:
numcodecs.Blosc.

SyntaxError: invalid syntax (3911409300.py, line 1)

In [4]:
import blosc

ModuleNotFoundError: No module named 'blosc'

In [9]:
chunk_duration=1000e-3
compressors = (
    numcodecs.Blosc(cname="zstd", clevel=9, shuffle=-1),
    numcodecs.Blosc(cname="zlib", clevel=9, shuffle=-1),
    numcodecs.Blosc(cname="lz4", clevel=9, shuffle=-1),
    numcodecs.Blosc(cname="lz4hc", clevel=9, shuffle=-1),
    numcodecs.Blosc(cname="blosclz", clevel=9, shuffle=-1),
)

test = x.round(5)
buf = test.tobytes()
for compressor in compressors:
    out_size = len(compressor.encode(buf))

    print(f'{compressor} - disk size: {out_size/1e6:0.1f} MB, mem size: {len(buf)/1e6:0.1f} MB, CR={len(buf)/out_size:0.3f}')

Blosc(cname='zstd', clevel=9, shuffle=AUTOSHUFFLE, blocksize=0) - disk size: 10.4 MB, mem size: 12.3 MB, CR=1.179
Blosc(cname='zlib', clevel=9, shuffle=AUTOSHUFFLE, blocksize=0) - disk size: 10.5 MB, mem size: 12.3 MB, CR=1.168
Blosc(cname='lz4', clevel=9, shuffle=AUTOSHUFFLE, blocksize=0) - disk size: 12.3 MB, mem size: 12.3 MB, CR=1.000
Blosc(cname='lz4hc', clevel=9, shuffle=AUTOSHUFFLE, blocksize=0) - disk size: 12.3 MB, mem size: 12.3 MB, CR=1.000
Blosc(cname='blosclz', clevel=9, shuffle=AUTOSHUFFLE, blocksize=0) - disk size: 12.3 MB, mem size: 12.3 MB, CR=1.000


In [None]:
compressors = [
    numcodecs.zfpy.ZFPY(tolerance=tol)
    for tol in (1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2)
]

xmagphase = x.copy().view('float32')
xmagphase[::2] = np.abs(x).round(2)
xmagphase[1::2] = np.angle(x).round(2)

test = x.view('float32')
buf = x.tobytes()

print('\n\nReal-Imag')
for compressor in compressors:
    out = compressor.encode(x.view('float32'))

    %timeit -n1 -r1 compressor.encode(x.view('float32'))
    %timeit -n1 -r1 compressor.decode(out)
    out_size = len(out)

    print(f'{compressor} - disk size: {out_size/1e6:0.1f} MB, mem size: {len(buf)/1e6:0.1f} MB, CR={len(buf)/out_size:0.3f}')

print('\n\nMagnitude-Phase')
for compressor in compressors:
    out = compressor.encode(xmagphase)

    %timeit -n1 -r1 compressor.encode(xmagphase)
    %timeit -n1 -r1 compressor.decode(out)
    out_size = len(out)

    print(f'{compressor} - disk size: {out_size/1e6:0.1f} MB, mem size: {len(buf)/1e6:0.1f} MB, CR={len(buf)/out_size:0.3f}')



Real-Imag
70 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
117 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
ZFPY(mode=4, tolerance=1e-07, rate=-1, precision=-1) - disk size: 11.3 MB, mem size: 12.3 MB, CR=1.085
66.1 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
102 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
ZFPY(mode=4, tolerance=1e-06, rate=-1, precision=-1) - disk size: 9.8 MB, mem size: 12.3 MB, CR=1.255
63.4 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
91.5 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
ZFPY(mode=4, tolerance=1e-05, rate=-1, precision=-1) - disk size: 8.6 MB, mem size: 12.3 MB, CR=1.422
61 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
80.2 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
ZFPY(mode=4, tolerance=0.0001, rate=-1, precision=-1) - disk size: 7.5 MB, mem size: 12.3 MB, CR=1.641
58.2 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

In [None]:
chunk_duration=100e-3
compressor = numcodecs.Zstd(level=14)

chunks =  {'dim_0': round(fs*chunk_duration)}
ds.chunk(chunks).to_zarr(path, encoding={"iq_waveform": {"compressor": compressor}}, mode='w')

disk_size = recursive_size(f'{path}/iq_waveform')
print(f'disk size: {disk_size/1e6:0.1f} MB, mem size: {mem_size/1e6:0.1f} MB, CR={mem_size/disk_size:0.3f}')

disk size: 10.4 MB, mem size: 12.3 MB, CR=1.182


In [19]:
chunk_duration=100e-3
compressor = numcodecs.LZMA(preset=0)

test = x.round(5)
buf = test.tobytes()

out = compressor.encode(buf)
%timeit -n1 -r1 compressor.encode(buf)
%timeit -n1 -r1 compressor.decode(out)
out_size = len(out)

print(f'{compressor} - disk size: {out_size/1e6:0.1f} MB, mem size: {len(buf)/1e6:0.1f} MB, CR={len(buf)/out_size:0.3f}')

1.03 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
340 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
LZMA(format=1, check=-1, preset=0, filters=None) - disk size: 9.4 MB, mem size: 12.3 MB, CR=1.314


In [20]:
chunk_duration=100e-3
compressor = numcodecs.Zstd(level=17)

test = x.round(5)
buf = test.tobytes()

out = compressor.encode(buf)
%timeit -n1 -r1 compressor.encode(buf)
%timeit -n1 -r1 compressor.decode(out)
out_size = len(out)

print(f'{compressor} - disk size: {out_size/1e6:0.1f} MB, mem size: {len(buf)/1e6:0.1f} MB, CR={len(buf)/out_size:0.3f}')

1.47 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
25.9 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
Zstd(level=17) - disk size: 8.8 MB, mem size: 12.3 MB, CR=1.402


In [23]:
chunk_duration=100e-3
compressor = numcodecs.BZ2(level=4)

test = x.round(5)
buf = test.tobytes()


out = compressor.encode(buf)
%timeit -n1 -r1 compressor.encode(buf)
%timeit -n1 -r1 compressor.decode(out)
out_size = len(out)

print(f'{compressor} - disk size: {out_size/1e6:0.1f} MB, mem size: {len(buf)/1e6:0.1f} MB, CR={len(buf)/out_size:0.3f}')

724 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
395 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
BZ2(level=4) - disk size: 9.7 MB, mem size: 12.3 MB, CR=1.273


In [24]:
chunk_duration=100e-3
compressor = numcodecs.GZip(level=19)

test = x.round(5)
buf = test.tobytes()

out = compressor.encode(buf)
%timeit -n1 -r1 compressor.encode(buf)
%timeit -n1 -r1 compressor.decode(out)
out_size = len(out)

print(f'{compressor} - disk size: {out_size/1e6:0.1f} MB, mem size: {len(buf)/1e6:0.1f} MB, CR={len(buf)/out_size:0.3f}')

ValueError: Invalid initialization option