### Canary vocalization dataset custom parsing
- This dataset has:
    - A number of WAVs where naming convention stores the individuals vocalizing and datetime of vocalization
    - Corresponding .TextGrid files with phrase identity
- This notebook extracts periods of vocalization into new WAV files, and creates a corresponding JSON and TextGrid for each WAV with annotation information
- Dataset origin:
    - Received via correspondence with the Gardner Lab

In [1]:
# Short identifier for this dataset.
# NOTE(review): no later cell visible in this notebook references it — confirm where it is consumed.
DATASET_ID = 'canary'

In [2]:
from avgn.utils.general import prepare_env



In [3]:
# Project helper from avgn.utils.general; the output recorded for this cell shows
# it setting the CUDA_VISIBLE_DEVICES environment variable.
prepare_env()

env: CUDA_VISIBLE_DEVICES=GPU


### Import relevant packages

In [4]:
from joblib import Parallel, delayed
from tqdm.autonotebook import tqdm
import pandas as pd
import librosa
from datetime import datetime
import json
import numpy as np

In [5]:
import avgn
from avgn.custom_parsing.gardner_canary import (
    get_phrases,
    gen_wav_json
)
from avgn.utils.paths import DATA_DIR

### Load data in original format

In [6]:
# Timestamp (YYYY-MM-DD_HH-MM-SS) that tags every file written by this run of the notebook
DT_ID = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"
DT_ID

'2019-10-07_12-03-35'

In [7]:
# Root directory of the dataset in its original format.
# NOTE(review): hard-coded absolute path to a specific mount — adjust for your machine.
DSLOC = avgn.utils.paths.Path('/mnt/cube/Datasets/canary/FromYarden2018')

In [8]:
# Gather the TextGrid annotation files, skipping hidden files whose name starts with '.'
TGLIST = np.array(
    [
        tg_path
        for tg_path in DSLOC.expanduser().glob('TextGrids/*.TextGrid')
        if not tg_path.stem.startswith('.')
    ]
)
TGLIST[0], len(TGLIST)

(PosixPath('/mnt/cube/Datasets/canary/FromYarden2018/TextGrids/bird1_0008_2004_03_06_08_22.TextGrid'),
 2556)

In [9]:
# Gather every WAV file located exactly one directory below the dataset root
WAVLIST = np.array([*DSLOC.expanduser().glob('*/*.wav')])
WAVLIST[0], len(WAVLIST)

(PosixPath('/mnt/cube/Datasets/canary/FromYarden2018/bird_3/bird3_0067_2004_03_07_15_36.wav'),
 3776)

### Load TextGrid data

In [10]:
# File stems (file name without extension) of the WAVs, in the same order as WAVLIST
wav_stems = np.array([wav_path.stem for wav_path in WAVLIST])

In [11]:
# Run get_phrases on every TextGrid across all available cores and stack the
# per-file results into one frame (presumably one row per annotated phrase).
with Parallel(n_jobs=-1, verbose=10) as parallel:
    phrase_df = pd.concat(
        parallel(
            delayed(get_phrases)(tg, WAVLIST, wav_stems)
            for tg in tqdm(TGLIST)
        )
    )

HBox(children=(IntProgress(value=0, max=2556), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  80 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 194 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 217 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 240 tasks      | elapsed:  




[Parallel(n_jobs=-1)]: Done 2556 out of 2556 | elapsed:  2.9min finished


In [12]:
# Preview the first three rows of the phrase table
phrase_df.head(3)

Unnamed: 0,indv,rendition,datetime,wavloc,tgloc,phrase_num,phrase_start,phrase_end,phrase_label
0,bird1,8,2004-03-06 08:22:00,/mnt/cube/Datasets/canary/FromYarden2018/bird_...,/mnt/cube/Datasets/canary/FromYarden2018/TextG...,0,0.062792,4.417292,1
1,bird1,8,2004-03-06 08:22:00,/mnt/cube/Datasets/canary/FromYarden2018/bird_...,/mnt/cube/Datasets/canary/FromYarden2018/TextG...,1,4.438958,4.853333,11
2,bird1,8,2004-03-06 08:22:00,/mnt/cube/Datasets/canary/FromYarden2018/bird_...,/mnt/cube/Datasets/canary/FromYarden2018/TextG...,2,4.875,5.893333,5


### Reformat and create JSON

In [13]:
# Should we generate a new wav file (True), or just reference the original (False)?
save_wav = False

In [14]:
# Call gen_wav_json once per source WAV, passing that WAV's phrase rows ordered by
# phrase_num together with the run identifier DT_ID and the save_wav flag.
#
# The rows are partitioned with a single groupby instead of building a boolean mask
# over the whole frame for every WAV (which scales as n_wavs * n_rows).
# sort=False yields the groups in order of first appearance, i.e. the same order
# that phrase_df.wavloc.unique() produced, and the groupby object has a length so
# tqdm still shows a total.
with Parallel(n_jobs=-1, verbose=10) as parallel:
    parallel(
        delayed(gen_wav_json)(
            wf, wav_phrases.sort_values(by='phrase_num'), DT_ID, save_wav=save_wav
        )
        for wf, wav_phrases in tqdm(phrase_df.groupby('wavloc', sort=False))
    );  # trailing ';' suppresses the list of return values in the cell output

HBox(children=(IntProgress(value=0, max=2556), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  27 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done  80 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:   13.9s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done 194 tasks      | elapsed:   17.5s
[Parallel(n_jobs=-1)]: Done 217 tasks      | elapsed:   19.2s
[Parallel(n_jobs=-1)]: Done 240 tasks      | elapsed:  




[Parallel(n_jobs=-1)]: Done 2556 out of 2556 | elapsed:  2.4min finished
