### Humpback whale custom parsing
- This dataset has vocalizations from a single individual. Files include:
    - .BOX (tsv) of vocalization times and frequencies
    - .wav vocalizations corresponding to the .box vocalizations
- This notebook creates a JSON corresponding to each WAV file, as well as a WAV file from the MP3
- Dataset origin:
    - https://www.mobysound.org/mysticetes.html

In [1]:
from avgn.utils.general import prepare_env

In [2]:
# Configure the process environment before the heavier imports below; the
# recorded output of this cell shows the call setting CUDA_VISIBLE_DEVICES.
prepare_env()

env: CUDA_VISIBLE_DEVICES=GPU


### Import relevant packages

In [3]:
from joblib import Parallel, delayed
from tqdm.autonotebook import tqdm
import pandas as pd
# No cap on the number of columns shown, so wide DataFrame previews are not truncated.
pd.options.display.max_columns = None
import librosa
from datetime import datetime
import numpy as np



In [4]:
import avgn
from avgn.custom_parsing.mobysound_humpback import (
    load_labs,
    find_longest_nonvocal_stretch,
    generate_noise_and_json,
)
from avgn.utils.paths import DATA_DIR

### Load data in original format

In [5]:
# Name identifying this dataset.
# NOTE(review): not referenced by any cell visible here — confirm whether the
# parsing helpers are meant to receive it, or whether it can be dropped.
DATASET_ID = 'mobysound_humpback_whale'

In [6]:
# Timestamp tag (second resolution) identifying this run; it is handed to the
# JSON-generation step below so that step's outputs can be tagged with it.
run_started = datetime.now()
DT_ID = run_started.strftime("%Y-%m-%d_%H-%M-%S")
DT_ID

'2019-06-23_20-56-52'

In [7]:
# Folder holding the raw MobySound humpback recordings (.wav) and annotations (.box).
# NOTE(review): this is an absolute, machine-specific path, while DATA_DIR is
# imported above but not used in any visible cell — consider deriving the
# location from a configurable root so the notebook runs on other machines.
DSLOC = avgn.utils.paths.Path('/mnt/cube/Datasets/mobysound/humpback-01/')
DSLOC

PosixPath('/mnt/cube/Datasets/mobysound/humpback-01')

In [8]:
# Gather the recordings and their .box annotation files from the dataset folder,
# then show how many of each were found (the counts are expected to match).
wavs, labels = (list(DSLOC.glob(pattern)) for pattern in ("*.wav", "*.box"))
len(wavs), len(labels)

(14, 14)

### load labels

In [9]:
# NOTE(review): presumably the recordings' sampling rate in Hz — it is not used
# by any cell visible here; confirm its meaning or remove it.
rate = 4000

In [10]:
# Read all .box annotation files into a single table, report its row count,
# and preview the first three rows.
label_df = load_labs(labels)
print(label_df.shape[0])
label_df.head(3)

2310


Unnamed: 0,index,start_time,end_time,low_freq,high_freq,SNR,file
0,0,1.0,2.67,28.6175,1996.070068,23.104601,940305-1007
1,1,4.6,5.94,71.543701,2024.6875,34.156601,940305-1007
2,2,6.43,8.2,21.4631,2003.224365,36.505402,940305-1007


### generate JSON

In [11]:
# Index the recordings by file stem once, instead of rebuilding two numpy
# arrays for every labelled file. setdefault keeps the first path seen for a
# stem, matching the original "first match" selection.
wav_by_stem = {}
for wav_path in wavs:
    wav_by_stem.setdefault(wav_path.stem, wav_path)

# Each distinct value in the `file` column names one recording; evaluate the
# unique values once and reuse them for both the loop and the progress total.
file_stems = label_df.file.unique()

# Fail before any worker starts, naming the offending files, rather than with
# a bare lookup error part-way through dispatching jobs.
missing_wavs = [fn for fn in file_stems if fn not in wav_by_stem]
if missing_wavs:
    raise FileNotFoundError(
        "no .wav file found for labelled file(s): {}".format(
            ", ".join(str(fn) for fn in missing_wavs)
        )
    )

# One job per recording: each job receives its running index, the file stem,
# the run's timestamp tag, the wav path, and that file's label rows ordered
# by start time.
with Parallel(n_jobs=-1, verbose=10) as parallel:
    parallel(
        delayed(generate_noise_and_json)(
            bout_number,
            fn,
            DT_ID,
            wavloc=wav_by_stem[fn],
            file_df=label_df[label_df.file == fn].sort_values(by="start_time"),
        )
        for bout_number, fn in tqdm(enumerate(file_stems), total=len(file_stems))
    )

HBox(children=(IntProgress(value=0, max=14), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.





[Parallel(n_jobs=-1)]: Done   3 out of  14 | elapsed:    3.2s remaining:   11.6s
[Parallel(n_jobs=-1)]: Done   5 out of  14 | elapsed:    3.2s remaining:    5.8s
[Parallel(n_jobs=-1)]: Done   7 out of  14 | elapsed:    3.3s remaining:    3.3s
[Parallel(n_jobs=-1)]: Done   9 out of  14 | elapsed:    3.3s remaining:    1.8s
[Parallel(n_jobs=-1)]: Done  11 out of  14 | elapsed:    3.4s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  14 out of  14 | elapsed:    3.4s finished
