### White munia custom parsing
- An unlabelled dataset of white munia vocalizations:
    - WAV files with individual ID
- This notebook creates a JSON file corresponding to each WAV file (and to each noise file, where available).
- Dataset origin:
    - https://www.ncbi.nlm.nih.gov/pubmed/24284561
    - https://datadryad.org/resource/doi:10.5061/dryad.6pt8g

In [1]:
from avgn.utils.general import prepare_env

In [2]:
# Run avgn's environment setup helper before anything else is imported.
# The recorded output of this cell shows it setting CUDA_VISIBLE_DEVICES;
# NOTE(review): any other effects live in avgn.utils.general — confirm there.
prepare_env()

env: CUDA_VISIBLE_DEVICES=GPU


### Import relevant packages

In [3]:
from joblib import Parallel, delayed
from tqdm.autonotebook import tqdm
import pandas as pd
pd.options.display.max_columns = None
import librosa
from datetime import datetime
import numpy as np



In [4]:
import avgn
from avgn.custom_parsing.katahira_white_munia import generate_json
from avgn.utils.paths import DATA_DIR

### Load data in original format

In [5]:
# Name identifying this dataset.
# NOTE(review): no visible cell below reads DATASET_ID (generate_json is only
# given the row and DT_ID) — confirm whether it is still needed.
DATASET_ID = 'katahira_white_munia'

In [16]:
# Timestamp of this run (second resolution), used as the identifier passed to
# generate_json for the files this notebook outputs.
DT_ID = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"
DT_ID

'2019-06-26_10-52-13'

In [17]:
# Root directory of the raw White Munia dataset.
# NOTE(review): this is a hardcoded absolute path, so the notebook only runs
# where /mnt/cube is mounted — consider deriving it from a configurable base
# directory instead.
DSLOC = avgn.utils.paths.Path('/mnt/cube/Datasets/WhiteMunia')
DSLOC

PosixPath('/mnt/cube/Datasets/WhiteMunia')

In [36]:
# Gather the files sitting one directory level below WM/.
# The pattern keeps any file whose name does not start with '.' and contains
# a '.', so it is not restricted to the .wav extension.
WAVLIST = [
    *(DSLOC / 'WM').expanduser().glob('*/[!.]*.*')
]
# Show how many files were found, plus the first one as a spot check.
(len(WAVLIST), WAVLIST[0])

(177, PosixPath('/mnt/cube/Datasets/WhiteMunia/WM/WM40/1.wav'))

In [40]:
# Build the file table in a single DataFrame construction instead of
# appending one row at a time with .loc (each enlargement reallocates the
# frame, and the loop leaked its temporaries into the kernel namespace).
#
# Columns:
#   wavloc  - path of the file as found in WAVLIST
#   indv    - stem of the file's parent directory (e.g. 'WM01' in the preview)
#   wav_num - stem of the file name (e.g. '1' in the preview)
wav_df = pd.DataFrame(
    [(wf, wf.parent.stem, wf.stem) for wf in WAVLIST],
    columns=['wavloc', 'indv', 'wav_num'],
)

In [41]:
# Sanity check: print the number of rows, then display the first three.
print(wav_df.shape[0])
wav_df.head(3)

177


Unnamed: 0,wavloc,indv,wav_num
0,/mnt/cube/Datasets/WhiteMunia/WM/WM01/1.wav,WM01,1
1,/mnt/cube/Datasets/WhiteMunia/WM/WM02/1.wav,WM02,1
2,/mnt/cube/Datasets/WhiteMunia/WM/WM02/2.wav,WM02,2


### Generate JSON for wav

In [42]:
# Call generate_json once per row of wav_df, passing the shared DT_ID, with a
# progress bar sized to the number of rows.
# NOTE(review): what generate_json writes, and where, is defined in
# avgn.custom_parsing.katahira_white_munia — confirm there.
n_wav_rows = len(wav_df)
for row_idx, wav_row in tqdm(wav_df.iterrows(), total=n_wav_rows):
    generate_json(wav_row, DT_ID)

HBox(children=(IntProgress(value=0, max=177), HTML(value='')))


