### Beaked whale custom parsing
- A labelled (but smaller) dataset of beaked whale vocalizations
    - .WAV files with individual labels
- This notebook creates a JSON corresponding to each WAV file (and Noise file where available).
- Dataset origin:
    - https://zenodo.org/record/3237218

In [1]:
from avgn.utils.general import prepare_env



In [2]:
# Run avgn's environment setup helper; per this cell's output it sets the
# CUDA_VISIBLE_DEVICES environment variable.
prepare_env()

env: CUDA_VISIBLE_DEVICES=GPU


### Import relevant packages

In [3]:
from joblib import Parallel, delayed
from tqdm.autonotebook import tqdm
import pandas as pd
pd.options.display.max_columns = None
import librosa
from datetime import datetime
import numpy as np

In [4]:
import avgn
from avgn.custom_parsing.beaked_whale_hildebrand import generate_wav_json
from avgn.utils.paths import DATA_DIR

### Load data in original format

In [5]:
# Timestamp captured once at run time; serves as a unique identifier for the
# files this notebook writes out.
DT_ID = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"
DT_ID

'2019-09-30_14-41-47'

In [6]:
# Location of the raw dataset file loaded below.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the notebook on another machine.
DSLOC = avgn.utils.paths.Path(
    "/mnt/cube/tsainbur/Projects/github_repos/BeakedWhaleClassification/"
).joinpath("data", "DSE230_version_Data", "whale_data_15mb.np")
DSLOC

PosixPath('/mnt/cube/tsainbur/Projects/github_repos/BeakedWhaleClassification/data/DSE230_version_Data/whale_data_15mb.np')

In [7]:
# Rate passed to generate_wav_json for every row
# (presumably the audio sampling rate in Hz — TODO confirm).
rate = 200_000

In [8]:
def unpackArray(x, data_type=np.int16):
    """Interpret a raw byte buffer as a 1-D numpy array.

    Parameters
    ----------
    x : bytes-like
        Buffer holding the packed values.
    data_type : numpy dtype, optional
        Element type used to decode the buffer (default ``np.int16``).

    Returns
    -------
    numpy.ndarray
        Array view over ``x`` with dtype ``data_type``.
    """
    decoded = np.frombuffer(x, dtype=data_type)
    return decoded


In [9]:
# Read the dataset array from disk.
# NOTE(review): if the file stores object/pickled data, recent NumPy releases
# require allow_pickle=True here — confirm against the installed version.
file_data = np.load(DSLOC.as_posix())

# Build the table (column names are applied to the array's columns in order),
# then decode the MSN and MSP byte buffers into float64 arrays.
song_df = pd.DataFrame(
    file_data,
    columns=[
        "time", "species", "site", "rec_no", "bout_i", "peak2peak",
        "MSN", "MSP",
        "TPWS1", "MD1", "FD1",
        "TPWS2", "MD2", "FD2",
        "TPWS3", "MD3", "FD3",
    ],
).assign(
    MSN=lambda df: [
        unpackArray(buf, data_type=np.float64) for buf in df["MSN"].values
    ],
    MSP=lambda df: [
        unpackArray(buf, data_type=np.float64) for buf in df["MSP"].values
    ],
)

In [10]:
# Preview the first three rows of the table.
song_df.head(3)

Unnamed: 0,time,species,site,rec_no,bout_i,peak2peak,MSN,MSP,TPWS1,MD1,FD1,TPWS2,MD2,FD2,TPWS3,MD3,FD3
0,2010-08-10 11:45:36.626459,Cuviers,DT,1,5,50.2422,"[9.3197, -13.5086, -6.1229, 5.5962, -0.8494, 1...","[-37.7052, -40.2157, -64.4694, -69.3526, -68.2...",1,0,0,1,0,0,0,0,0
1,2010-08-10 11:46:06.004604,Cuviers,DT,1,5,55.0377,"[-2.2228, 6.5712, -14.2926, 8.2005, -1.0006, 2...","[-34.7333, -37.8125, -76.5551, -63.8601, -63.2...",1,0,0,1,0,0,0,0,0
2,2010-08-10 12:11:25.866437,Cuviers,DT,1,5,44.9439,"[5.8497, 0.9312, -1.3678, -1.0723, -4.6082, 0....","[-39.6219, -42.9117, -70.0049, -80.345, -82.53...",1,0,0,1,0,0,0,0,0


In [11]:
# Number of rows in the table.
song_df.shape[0]

4175

### create JSON and WAV for each file

In [12]:
# Dispatch one generate_wav_json task per row across all available cores
# (n_jobs=-1); every task receives the row, the shared rate and the run id.
with Parallel(n_jobs=-1, verbose=10) as parallel:
    # Wrap the row iterator so a progress bar tracks task submission.
    row_iter = tqdm(song_df.iterrows(), total=len(song_df))
    parallel(
        delayed(generate_wav_json)(row, rate, DT_ID) for _, row in row_iter
    )

HBox(children=(IntProgress(value=0, max=4175), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1947s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0801s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done  91 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 131 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1718s.) Setting batch_size=18.
[Parallel(n_jobs=-1)]: Done 260 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 400 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:




[Parallel(n_jobs=-1)]: Done 4175 out of 4175 | elapsed:    8.2s finished
