### Bengalese-finch custom parsing
- A labelled dataset of Bengalese finch syllables:
    - One `Annotation.xml` file per bird, containing the annotations for that bird's WAV files
    - WAV files containing the vocalizations
- This notebook creates a JSON file corresponding to each WAV file (and noise file, where available).
- Dataset origin:
    - https://figshare.com/articles/BirdsongRecognition/3470165

In [1]:
from avgn.utils.general import prepare_env

In [2]:
# Run avgn's environment setup helper before anything else is imported.
# In the recorded run this printed `env: CUDA_VISIBLE_DEVICES=GPU`.
prepare_env()

env: CUDA_VISIBLE_DEVICES=GPU


### Import relevant packages

In [3]:
from joblib import Parallel, delayed
from tqdm.autonotebook import tqdm
import pandas as pd
pd.options.display.max_columns = None
import librosa
from datetime import datetime
import numpy as np



In [4]:
import avgn
from avgn.custom_parsing.koumura_bengalese_finch import generate_json, Koumura_Okanoya_parser
from avgn.utils.paths import DATA_DIR

### Load data in original format

In [5]:
# Identifier for this dataset.
# NOTE(review): not referenced by the other cells visible in this notebook;
# presumably mirrors the dataset name used by downstream avgn code - confirm.
DATASET_ID = 'koumura_bengalese_finch'

In [6]:
# Timestamp of this run (local time, one-second resolution); it is passed to
# generate_json below so that the files written by this run share one tag.
DT_ID = format(datetime.now(), "%Y-%m-%d_%H-%M-%S")
DT_ID

'2019-06-26_10-25-16'

In [7]:
# Root folder of the raw dataset (the Bird*/ folders are globbed from here).
# NOTE(review): this is a machine-specific absolute path; point it at your own
# copy of the data before running the notebook.
DSLOC = avgn.utils.paths.Path('/mnt/cube/Datasets/BengaleseFinch/Koumura_Okanoya-paper_data/')
DSLOC

PosixPath('/mnt/cube/Datasets/BengaleseFinch/Koumura_Okanoya-paper_data')

In [8]:
# Gather the recordings of every bird: <DSLOC>/Bird*/Wave/*.wav
wav_list = [*DSLOC.glob('Bird*/Wave/*.wav')]
# Sanity check: number of files found and the two paths that sort last
len(wav_list), np.sort(wav_list)[-2:]

(2968,
 array([PosixPath('/mnt/cube/Datasets/BengaleseFinch/Koumura_Okanoya-paper_data/Bird9/Wave/98.wav'),
        PosixPath('/mnt/cube/Datasets/BengaleseFinch/Koumura_Okanoya-paper_data/Bird9/Wave/99.wav')],
       dtype=object))

In [9]:
# Gather the annotation file of every bird: <DSLOC>/Bird*/Annotation.xml
annotation_files = [*DSLOC.glob('Bird*/Annotation.xml')]
# Sanity check: number of files found and the two paths that sort last
len(annotation_files), np.sort(annotation_files)[-2:]

(11,
 array([PosixPath('/mnt/cube/Datasets/BengaleseFinch/Koumura_Okanoya-paper_data/Bird8/Annotation.xml'),
        PosixPath('/mnt/cube/Datasets/BengaleseFinch/Koumura_Okanoya-paper_data/Bird9/Annotation.xml')],
       dtype=object))

### Generate JSON for each wav

In [10]:
# Parse the per-bird annotation files together with the WAV list into a single
# table. The preview further down shows the columns bird, WaveFileName,
# Position, Length, NumNote, NotePositions, NoteLengths and NoteLabels, and
# that several rows can share the same WaveFileName.
song_df = Koumura_Okanoya_parser(annotation_files, wav_list)

HBox(children=(IntProgress(value=0, max=11), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1964), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2110), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1351), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1486), HTML(value='')))

HBox(children=(IntProgress(value=0, max=412), HTML(value='')))

HBox(children=(IntProgress(value=0, max=572), HTML(value='')))

HBox(children=(IntProgress(value=0, max=419), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1854), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1495), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2501), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1238), HTML(value='')))




In [11]:
# Number of rows in the parsed annotation table
song_df.shape[0]

15391

In [12]:
# Preview the first three rows of the parsed annotation table
song_df.head(3)

Unnamed: 0,bird,WaveFileName,Position,Length,NumNote,NotePositions,NoteLengths,NoteLabels
0,Bird4,0.wav,32000,60880,13,"[5056, 10240, 15648, 26240, 29760, 33952, 3708...","[2304, 2464, 2848, 2848, 1696, 2336, 2528, 265...","[0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2]"
1,Bird4,0.wav,92880,56064,14,"[512, 4288, 8480, 12896, 17888, 23168, 28096, ...","[2848, 3008, 2880, 2336, 2240, 2944, 3264, 176...","[3, 3, 3, 4, 4, 0, 0, 1, 2, 2, 2, 2, 2, 2]"
2,Bird4,0.wav,152624,51312,13,"[704, 5248, 10240, 15520, 19456, 22112, 25856,...","[2880, 2496, 2304, 3264, 1920, 2912, 2752, 262...","[3, 4, 4, 0, 1, 2, 2, 2, 2, 3, 3, 3, 3]"


In [13]:
# Write one JSON per WAV file, bird by bird.
#
# The worker pool is opened once and reused for every bird; opening the
# `Parallel` context inside the loop would set it up and tear it down again on
# each iteration. Rows are split with `groupby` in a single pass per level
# instead of re-filtering the table with a boolean mask for every bird and
# every WAV file.
with Parallel(n_jobs=-1, verbose=10) as parallel:
    # groupby sorts its keys by default, so birds are visited in sorted order.
    for bird, bird_df in tqdm(song_df.groupby("bird"), desc="birds"):
        parallel(
            delayed(generate_json)(DSLOC, DT_ID, bird, wfn, wfn_df=wfn_df)
            # sort=False keeps the WAV files in order of first appearance
            # within this bird's rows.
            for wfn, wfn_df in tqdm(
                bird_df.groupby("WaveFileName", sort=False), desc=str(bird)
            )
        )

HBox(children=(IntProgress(value=0, max=11), HTML(value='')))

HBox(children=(IntProgress(value=0, max=135), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1941s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  80 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done  88 out of 135 | elapsed:    5.5s remaining:    2.9s
[Parallel(n_jobs=-1)]: Done 102 out of 135 | elapsed:    5.5s remaining:    1.8s
[Parallel(n_jobs=-1)]: Done 116 out of 135 | elapsed:    5.5s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:    5.5s finished


HBox(children=(IntProgress(value=0, max=315), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0166s.) Setting batch_size=24.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0166s.) Setting batch_size=578.
[Parallel(n_jobs=-1)]: Done  12 out of 315 | elapsed:    0.3s remaining:    8.7s
[Parallel(n_jobs=-1)]: Done 315 out of 315 | elapsed:    2.9s finished


HBox(children=(IntProgress(value=0, max=94), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0385s.) Setting batch_size=10.
[Parallel(n_jobs=-1)]: Done   7 out of  94 | elapsed:    0.1s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done  17 out of  94 | elapsed:    0.1s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  27 out of  94 | elapsed:    0.1s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  37 out of  94 | elapsed:    0.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  94 out of  94 | elapsed:    0.2s finished


HBox(children=(IntProgress(value=0, max=339), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0208s.) Setting batch_size=18.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0208s.) Setting batch_size=344.
[Parallel(n_jobs=-1)]: Done 339 out of 339 | elapsed:    2.4s finished


HBox(children=(IntProgress(value=0, max=402), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0238s.) Setting batch_size=16.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0238s.) Setting batch_size=268.
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 402 out of 402 | elapsed:    2.1s finished


HBox(children=(IntProgress(value=0, max=441), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0131s.) Setting batch_size=30.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0131s.) Setting batch_size=914.
[Parallel(n_jobs=-1)]: Done 441 out of 441 | elapsed:    2.9s finished


HBox(children=(IntProgress(value=0, max=335), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0135s.) Setting batch_size=28.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0135s.) Setting batch_size=828.
[Parallel(n_jobs=-1)]: Done 335 out of 335 | elapsed:    2.3s finished


HBox(children=(IntProgress(value=0, max=235), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0211s.) Setting batch_size=18.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0211s.) Setting batch_size=340.
[Parallel(n_jobs=-1)]: Done 235 out of 235 | elapsed:    1.8s finished


HBox(children=(IntProgress(value=0, max=310), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0247s.) Setting batch_size=16.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0247s.) Setting batch_size=258.
[Parallel(n_jobs=-1)]: Done   7 out of 310 | elapsed:    0.3s remaining:   14.0s
[Parallel(n_jobs=-1)]: Done 310 out of 310 | elapsed:    2.4s finished


HBox(children=(IntProgress(value=0, max=142), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0717s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  35 out of 142 | elapsed:    0.2s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  80 out of 142 | elapsed:    0.2s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 142 out of 142 | elapsed:    0.3s finished


HBox(children=(IntProgress(value=0, max=217), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0116s.) Setting batch_size=34.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0116s.) Setting batch_size=1168.





[Parallel(n_jobs=-1)]: Done 217 out of 217 | elapsed:    1.8s finished
