In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell is executed, so edits to local packages are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [36]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm, trange
from screaming_rocks import RATE, SENSOR_IDS
from screaming_rocks import dask_file_reading as dfr

In [7]:
# Directory that holds the sensor data files; it is passed to every reader call
# below. This is a machine-specific absolute path, so update it to the location
# of your own copy of the data before running the rest of the notebook.
data_folder = '/Volumes/data8pmb/EB2/SRM/'

We can easily read small amounts of data as numpy arrays - here we read one second of data from the sensor with ID 90414, starting at minute 25.

In [8]:
# One second of samples from sensor 90414, starting 25 minutes in.
one_second_batch = dfr.read_batch(
    90414,
    25 * 60,
    1,
    data_folder=data_folder,
)
print(one_second_batch)

[[33531 32771 32599 32756]
 [33529 32757 32602 32789]
 [33530 32710 32705 32794]
 ...
 [33515 32853 32857 32756]
 [33534 32851 32842 32694]
 [33494 32898 32754 32694]]


We can also read this in as a Pandas DataFrame, with correct time series indexing and columns

In [9]:
# Same one-second window as above, this time returned as a pandas DataFrame.
one_second_frame = dfr.read_batch_as_pandas(
    90414,
    25 * 60,
    1,
    data_folder=data_folder,
)
one_second_frame

Unnamed: 0_level_0,90414,90414,90414,90414
Unnamed: 0_level_1,0,1,2,3
00:25:00,33531,32771,32599,32756
00:25:00.000000,33529,32757,32602,32789
00:25:00.000000,33530,32710,32705,32794
00:25:00.000000,33552,32675,32796,32820
00:25:00.000000,33491,32746,32823,32817
...,...,...,...,...
00:25:00.999999,33482,32834,32897,32737
00:25:00.999999,33513,32869,32829,32731
00:25:00.999999,33515,32853,32857,32756
00:25:00.999999,33534,32851,32842,32694


If we want to read more data, we need to use Dask to handle out-of-memory computation, which is made easy using `dfr.read_as_dask`.

For example, to read five minutes of data starting at minute 20 (batched into pandas DataFrames of two seconds each, via `batch_size=2`):

In [15]:
# Five minutes (5 * 60 s) of data for sensor 90414, starting 20 minutes in,
# requested through the dask-based reader.
five_minute_frame = dfr.read_as_dask(
    90414,
    20 * 60,
    5 * 60,
    batch_size=2,
    data_folder=data_folder,
)
five_minute_frame

Unnamed: 0_level_0,90414,90414,90414,90414
Unnamed: 0_level_1,0,1,2,3
npartitions=150,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
,uint16,uint16,uint16,uint16
,...,...,...,...
...,...,...,...,...
,...,...,...,...
,...,...,...,...


We can then execute lazy computations on this data (warning: this takes a LONG time):

In [60]:
# NOTE(review): the output recorded below shows this cell failing with
# "ValueError: Names should be list-like for a MultiIndex". Whether the error
# originates in read_as_dask or in .sum() cannot be told from this notebook
# alone - TODO investigate before relying on this cell.
# Presumably the reduction is lazy and would also need .compute() to produce
# actual values - TODO confirm.
dfr.read_as_dask(90414, 20 * 60, 6, batch_size=2, data_folder=data_folder).sum()

ValueError: Names should be list-like for a MultiIndex

In [76]:
from datetime import datetime

# File-name template of the per-minute text files parsed below;
# the two placeholders are (sensor id, minute index).
ftpl = 'rct-uop-{:06d}.data.{:05d}.wve'


def _parse_datetime(value):
    """Parse a timestamp written as 'DDMMYYYY HHMMSS.ffffff' into a datetime."""
    return datetime.strptime(value, "%d%m%Y %H%M%S.%f")


# Converters for the fields that should not stay as plain strings.
# Any field that is not listed here is kept as its stripped string value.
parse_funcs = {
    'Number of channels': int,
    'Volt range (V)': float,
    'Sampling rate (Hz)': int,
    'Waveform Format': int,
    'Bit Range': int,
    'End Byte': np.int32,
    'Start DateTime': _parse_datetime,
    'End DateTime': _parse_datetime,
}


def _parse_metadata_lines(lines):
    """Convert an iterable of 'key: value' text lines into a dict.

    Each line is split on the FIRST ':' only, so a value that itself contains
    a colon is kept intact. Lines without any ':' (for example blank lines)
    are skipped instead of aborting the whole parse. Keys and values are
    stripped of surrounding whitespace, and values are converted with the
    matching entry of ``parse_funcs`` when there is one.
    """
    parsed = {}
    for line in lines:
        key, sep, value = line.partition(':')
        if not sep:
            continue
        key, value = key.strip(), value.strip()
        convert = parse_funcs.get(key)
        parsed[key] = value if convert is None else convert(value)
    return parsed


# Collect the parsed fields of every file that exists, keyed by
# (sensor_id, minute). Minute indices 0-39 are probed for every sensor;
# combinations without a file on disk are silently skipped.
meta = {}
for sensor_id in SENSOR_IDS:
    for minute in range(40):
        path = os.path.join(data_folder, ftpl.format(sensor_id, minute))
        if not os.path.isfile(path):
            continue
        with open(path, 'r') as f:
            meta[(sensor_id, minute)] = _parse_metadata_lines(f)

# Fail with a clear message rather than an obscure error further down when
# data_folder does not contain any matching file.
if not meta:
    raise FileNotFoundError(
        'No files matching {!r} were found in {!r}'.format(ftpl, data_folder)
    )

# Build the table once, after the loops have finished: one row per
# (sensor_id, minute) pair and one column per metadata field.
keys = list(meta)
idx = pd.MultiIndex.from_tuples(keys, names=('sensor_id', 'minute'))
df = pd.DataFrame([meta[k] for k in keys], index=idx)
df.to_csv('minute_metadata.csv')

In [80]:
# How often each distinct 'End Byte' value occurs across the parsed files.
end_byte_counts = df['End Byte'].value_counts()
end_byte_counts

486539264    90
167772160     3
Name: End Byte, dtype: int64