### woodpecker custom parsing
- This dataset consists of 1669 WAV files, most of which contain several syllables grouped into either 'song', 'call type' or 'drumming'. There are seven different species of vocalizers. The dataset is taken from Xeno-canto. The dataset consists of:
    - WAV files of vocalizations, each labeled with its species and vocalization type.
- This notebook creates a JSON corresponding to each WAV file.
- Dataset origin:
    - https://zenodo.org/record/574438#.XOnxJ9NKhTY

In [1]:
from avgn.utils.general import prepare_env

In [2]:
# Run avgn's environment-setup helper before anything else is imported.
# NOTE(review): the captured output below shows it setting CUDA_VISIBLE_DEVICES;
# see avgn.utils.general.prepare_env for anything else it configures.
prepare_env()

env: CUDA_VISIBLE_DEVICES=GPU


### Import relevant packages

In [3]:
from joblib import Parallel, delayed
from tqdm.autonotebook import tqdm
import pandas as pd
pd.options.display.max_columns = None
import librosa
from datetime import datetime
import numpy as np



In [4]:
import avgn
from avgn.custom_parsing.picidae_woodpecker import generate_json
from avgn.utils.paths import DATA_DIR

### Load data in original format

In [5]:
# Timestamp (second resolution) used to tag every file written by this run of
# the notebook, formatted as YYYY-MM-DD_HH-MM-SS.
DT_ID = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"
DT_ID

'2019-06-24_14-29-55'

In [6]:
# Root directory of the Picidae (woodpecker) dataset on disk.
# NOTE(review): this is an absolute, machine-specific path — point it at the
# location where the Zenodo archive was extracted before re-running.
DSLOC = avgn.utils.paths.Path('/mnt/cube/Datasets/Picidae/PicidaeDataset/')
DSLOC

PosixPath('/mnt/cube/Datasets/Picidae/PicidaeDataset')

In [7]:
# Gather every WAV file sitting exactly one directory below the dataset root,
# skipping hidden files (those whose stem begins with '.').
all_wavs = [
    wav_path
    for wav_path in DSLOC.expanduser().glob('*/*.wav')
    if not wav_path.stem.startswith('.')
]
len(all_wavs)

1669

In [8]:
# Build one record per labelled recording, then create the DataFrame in a
# single step. Appending rows with `wav_df.loc[len(wav_df)] = ...` reallocates
# the frame on every insert, which scales quadratically with the file count.
wav_records = []
for wav_loc in tqdm(all_wavs):
    # The parent directory name carries the labels as "<species>-<call_type>".
    label_dir = wav_loc.parent.stem
    # Recordings in the 'Silence' directory carry no species/call label; skip them.
    if label_dir == 'Silence':
        continue
    species, call_type = label_dir.split('-')
    wav_records.append(
        {
            'species': species,
            'call_type': call_type,
            'wavloc': wav_loc,
            # Portion of the file stem before the first '-'
            # (presumably the Xeno-canto recording ID, e.g. "XC252476").
            'origin': wav_loc.stem.split('-')[0],
        }
    )

# Passing `columns` fixes the column order and keeps the expected columns
# even when no recordings were found.
wav_df = pd.DataFrame(
    wav_records, columns=['species', 'call_type', 'wavloc', 'origin']
)

HBox(children=(IntProgress(value=0, max=1669), HTML(value='')))




In [9]:
# Preview the first three rows to sanity-check the parsed labels.
wav_df.head(3)

Unnamed: 0,species,call_type,wavloc,origin
0,PicusViridis,song,/mnt/cube/Datasets/Picidae/PicidaeDataset/Picu...,XC252476
1,PicusViridis,song,/mnt/cube/Datasets/Picidae/PicidaeDataset/Picu...,XC281262
2,PicusViridis,song,/mnt/cube/Datasets/Picidae/PicidaeDataset/Picu...,XC281263


In [10]:
# List the distinct species labels found in the dataset.
wav_df['species'].unique()

array(['PicusViridis', 'DryocopusMartius', 'DendrocoposMedius',
       'JynxTorquilla', 'DendrocoposLeucotos', 'DendrocoposMinor',
       'DendrocoposMajor'], dtype=object)

### create json for wavs

In [11]:
# Call generate_json once per row of wav_df, spread across all available cores
# (n_jobs=-1). Each call receives the row and the shared run identifier DT_ID.
with Parallel(n_jobs=-1, verbose=10) as worker_pool:
    worker_pool(
        delayed(generate_json)(wav_row, DT_ID)
        for _, wav_row in tqdm(wav_df.iterrows(), total=len(wav_df))
    )

HBox(children=(IntProgress(value=0, max=1146), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1871s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0667s.) Setting batch_size=12.
[Parallel(n_jobs=-1)]: Done  83 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 115 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1863s.) Setting batch_size=24.
[Parallel(n_jobs=-1)]: Done 166 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:    4.4s





[Parallel(n_jobs=-1)]: Done 754 out of 1146 | elapsed:    4.4s remaining:    2.3s
[Parallel(n_jobs=-1)]: Done 1146 out of 1146 | elapsed:    4.5s finished
