### Corn crake custom parsing
- This dataset has:
    - A CSV of corn crake vocalizations
    - MP3 files for each vocalization, with several syllables per mp3, annotated by age, sex, recording group, date, etc
- This notebook creates a JSON corresponding to each WAV file, as well as a WAV file from the MP3
- Dataset origin:
    - https://link.springer.com/article/10.1007/s10336-017-1438-7
    - http://www.tierstimmenarchiv.de/webinterface/contents/treebrowser.php

In [1]:
from avgn.utils.general import prepare_env

In [2]:
prepare_env()

env: CUDA_VISIBLE_DEVICES=GPU


### Import relevant packages

In [3]:
from joblib import Parallel, delayed
from tqdm.autonotebook import tqdm
import pandas as pd
pd.options.display.max_columns = None
import librosa
from datetime import datetime
import numpy as np



In [4]:
import avgn
from avgn.custom_parsing.budka_corncrake import generate_wav_json
from avgn.utils.paths import DATA_DIR

### Load data in original format

In [5]:
DATASET_ID = 'budka_crex_crex'

In [6]:
# create a unique datetime identifier for the files output by this notebook
DT_ID = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
DT_ID

'2019-06-23_11-02-14'

In [7]:
DSLOC = avgn.utils.paths.Path('/mnt/cube/Datasets/animalsoundarchive/crex_crex/')
DSLOC

PosixPath('/mnt/cube/Datasets/animalsoundarchive/crex_crex')

In [8]:
fg_list = np.array(list(DSLOC.glob('*.mp3')))
len(fg_list), np.sort(fg_list)[-5:]

(233,
 array([PosixPath('/mnt/cube/Datasets/animalsoundarchive/crex_crex/Crex_crex_89582_short.mp3'),
        PosixPath('/mnt/cube/Datasets/animalsoundarchive/crex_crex/Crex_crex_89583_short.mp3'),
        PosixPath('/mnt/cube/Datasets/animalsoundarchive/crex_crex/Crex_crex_89584_short.mp3'),
        PosixPath('/mnt/cube/Datasets/animalsoundarchive/crex_crex/Crex_crex_89585_short.mp3'),
        PosixPath('/mnt/cube/Datasets/animalsoundarchive/crex_crex/Crex_crex_89586_short.mp3')],
       dtype=object))

In [9]:
vocalization_lib = pd.read_csv(DSLOC.parent / 'recording_df.csv')
voc_df = vocalization_lib[(vocalization_lib.species == 'Crex crex')  & (vocalization_lib.author == 'Budka, Michal')]

In [10]:
voc_df = voc_df[
    [
        "filename",
        "species",
        "latitude",
        "longitude",
        "altitude",
        "recording_date",
        "recording_time",
        "sex",
        "age",
        "sound_type",
        "collection",
        "filename_ext",
        "description",
        "duration",
        "notes",
        "unique_identifier",
        "bytes",
        "recording_type",
        "recording_equipment",
        "notes"
    ]
]

In [11]:
voc_df[:3]

Unnamed: 0,filename,species,latitude,longitude,altitude,recording_date,recording_time,sex,age,sound_type,collection,filename_ext,description,duration,notes,unique_identifier,bytes,recording_type,recording_equipment,notes.1
7401,Crex_crex_89354,Crex crex,49.30593,22.03909,531.0,8.6.2011,22:09,male,adult,song,J_Orn,Crex_crex_89354_short.mp3,Song of corncrake. Recording used in Budka M. ...,00:00:29,,J_Orn:Crex_crex_89354,471000.0,w,"Marantz PMD 620, Sennheiser ME 67",
7402,Crex_crex_89355,Crex crex,49.30919,22.03963,550.0,8.6.2011,22:55,male,adult,song,J_Orn,Crex_crex_89355_short.mp3,Song of corncrake. Recording used in Budka M. ...,00:00:25,,J_Orn:Crex_crex_89355,395000.0,w,"Marantz PMD 620, Sennheiser ME 67",
7403,Crex_crex_89356,Crex crex,49.30871,22.05282,553.0,8.6.2011,23:51,male,adult,song,J_Orn,Crex_crex_89356_short.mp3,Song of corncrake. Recording used in Budka M. ...,00:00:25,,J_Orn:Crex_crex_89356,405000.0,w,"Marantz PMD 620, Sennheiser ME 67",


### generate JSON, save WAV

In [12]:
with Parallel(n_jobs=-1, verbose=10) as parallel:
    parallel(
        delayed(generate_wav_json)(
            row,
            DT_ID,
            mp3_path=fg_list[row.filename_ext == np.array([i.name for i in fg_list])][
                0
            ],
        )
        for idx, row in tqdm(voc_df.iterrows(), total=len(voc_df))
    )

HBox(children=(IntProgress(value=0, max=233), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done  80 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   15.3s





[Parallel(n_jobs=-1)]: Done 210 out of 233 | elapsed:   16.6s remaining:    1.8s
[Parallel(n_jobs=-1)]: Done 233 out of 233 | elapsed:   17.4s finished
