### Marmoset vocalization dataset custom parsing
- This dataset has:
    - A number of WAVs where naming convention stores the individuals vocalizing
    - Corresponding .mat files with the timing of each phee/call and the individual making the vocalization
- This notebook extracts periods of vocalization into new WAV files, and creates a corresponding JSON and TextGrid for each WAV with annotation information
- Dataset origin:
    - Recieved via correspondance with Miller Lab

In [1]:
from avgn.utils.general import prepare_env

In [2]:
prepare_env()

env: CUDA_VISIBLE_DEVICES=GPU


### Import relevant packages

In [3]:
from joblib import Parallel, delayed
from tqdm.autonotebook import tqdm
import pandas as pd
import librosa
from datetime import datetime
import json



In [4]:
import avgn
from avgn.custom_parsing.miller_marmoset import (
    parse_marmoset_data,
    parse_marmoset_calls,
    annotate_bouts,
    segment_wav_into_bouts
)
from avgn.utils.paths import DATA_DIR

### Load data in original format

In [5]:
# create a unique datetime identifier for the files output by this notebook
DT_ID = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
DT_ID

'2019-06-17_22-42-12'

In [6]:
DSLOC = avgn.utils.paths.Path('/mnt/cube/Datasets/Marmosets/FromMillerLab')

In [7]:
wavs = list(DSLOC.glob('*.wav'))
len(wavs), wavs[:3]

(186,
 [PosixPath('/mnt/cube/Datasets/Marmosets/FromMillerLab/han.todd.170621.wav'),
  PosixPath('/mnt/cube/Datasets/Marmosets/FromMillerLab/ares_spn_230217_203.wav'),
  PosixPath('/mnt/cube/Datasets/Marmosets/FromMillerLab/ares_ant_010317_33.wav')])

In [8]:
matfiles = list(DSLOC.glob("*.mat"))
len(matfiles), matfiles[:3]

(82,
 [PosixPath('/mnt/cube/Datasets/Marmosets/FromMillerLab/apollo_angel_140217.mat'),
  PosixPath('/mnt/cube/Datasets/Marmosets/FromMillerLab/jasmine.hermes.170622.mat'),
  PosixPath('/mnt/cube/Datasets/Marmosets/FromMillerLab/aladdin_banana_060317.mat')])

### Parse data into dataframe

In [9]:
wav_df = parse_marmoset_data(wavs, _filetype = "wav")
print(len(wav_df))
display(wav_df[:3])

183


Unnamed: 0,monkey1,monkey2,date,date_idx,wav_loc
0,han,todd,170621,,/mnt/cube/Datasets/Marmosets/FromMillerLab/han...
1,ares,spn,230217,203.0,/mnt/cube/Datasets/Marmosets/FromMillerLab/are...
2,ares,ant,10317,33.0,/mnt/cube/Datasets/Marmosets/FromMillerLab/are...


In [10]:
mf_df = parse_marmoset_data(matfiles, _filetype = "mat")
print(len(mf_df))
display(mf_df[:3])

81


Unnamed: 0,monkey1,monkey2,date,date_idx,mat_loc
0,apollo,angel,140217,,/mnt/cube/Datasets/Marmosets/FromMillerLab/apo...
1,jasmine,hermes,170622,,/mnt/cube/Datasets/Marmosets/FromMillerLab/jas...
2,aladdin,banana,60317,,/mnt/cube/Datasets/Marmosets/FromMillerLab/ala...


In [11]:
# merge dataframes
mf_df = pd.merge(
    mf_df,
    wav_df,
    how="left",
    left_on=["monkey1", "monkey2", "date", "date_idx"],
    right_on=["monkey1", "monkey2", "date", "date_idx"],
    suffixes=(False, False),
)
# remove unlabelled wavs
mf_df = mf_df[mf_df.wav_loc.isnull() == False]
print(len(mf_df))
display(mf_df[:3])

80


Unnamed: 0,monkey1,monkey2,date,date_idx,mat_loc,wav_loc
0,apollo,angel,140217,,/mnt/cube/Datasets/Marmosets/FromMillerLab/apo...,/mnt/cube/Datasets/Marmosets/FromMillerLab/apo...
1,jasmine,hermes,170622,,/mnt/cube/Datasets/Marmosets/FromMillerLab/jas...,/mnt/cube/Datasets/Marmosets/FromMillerLab/jas...
2,aladdin,banana,60317,,/mnt/cube/Datasets/Marmosets/FromMillerLab/ala...,/mnt/cube/Datasets/Marmosets/FromMillerLab/ala...


### Parse matfiles into syllables

In [12]:
syllable_df = pd.concat(
    Parallel(n_jobs=-1, verbose=10)(
        delayed(parse_marmoset_calls)(row)
        for idx, row in tqdm(mf_df.iterrows(), total=len(mf_df))
    )
)

HBox(children=(IntProgress(value=0, max=80), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    5.5s





[Parallel(n_jobs=-1)]: Done  42 out of  80 | elapsed:    6.3s remaining:    5.7s
[Parallel(n_jobs=-1)]: Done  51 out of  80 | elapsed:    6.9s remaining:    3.9s
[Parallel(n_jobs=-1)]: Done  60 out of  80 | elapsed:    7.3s remaining:    2.4s
[Parallel(n_jobs=-1)]: Done  69 out of  80 | elapsed:    7.8s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done  78 out of  80 | elapsed:    8.9s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    9.5s finished


In [13]:
print(len(syllable_df))
display(syllable_df[:3])

14295


Unnamed: 0,indv,partner,date,call_type,wav_loc,call_num,pulse_n,pulse_start,pulse_end
0,apollo,angel,140217,phee,/mnt/cube/Datasets/Marmosets/FromMillerLab/apo...,0,0,14.038007,16.171723
1,apollo,angel,140217,phee,/mnt/cube/Datasets/Marmosets/FromMillerLab/apo...,1,0,107.359792,108.729595
2,apollo,angel,140217,phee,/mnt/cube/Datasets/Marmosets/FromMillerLab/apo...,1,1,109.060383,110.417463


### segment WAVs into 'bouts'
- There are a lot of periods of time in the original datasets that are not occupied by any vocalizations. Here, we segment out those time periods and create new sub-WAVs. For each sub-WAV, we generate a JSON with metadata and segment information. 

In [14]:
# HParams is just a python object storing a set of hyperparameters.
hparams = avgn.utils.general.HParams(
    bout_segmentation_min_s = 30,  # Minimum amount of seconds between vocal activity required to split a wavfile
    bout_pad_s = 5, # how much time to pad this bout with on either side
    # noise clip
    get_noise_clip = True, # if a noise clip preceding the vocalization should be grabbed to help reduce noise in analysis
    max_noise_clip_size_s = 10, # how large the noise clip can be
    min_noise_clip_size_s = 1, # how small the noise clip can be
    
)

In [15]:
bout_dfs =  Parallel(n_jobs=-1, verbose=10)(
        delayed(segment_wav_into_bouts)(
            (
                syllable_df[syllable_df.wav_loc == wav_loc]
                .sort_values(by=["pulse_start"])
                .reset_index()
            ),
            hparams,
        )
        for wav_loc in tqdm(syllable_df.wav_loc.unique())
    )
bout_dfs = [item for sublist in bout_dfs for item in sublist]

HBox(children=(IntProgress(value=0, max=78), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1499s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1499s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.0s





[Parallel(n_jobs=-1)]: Done  78 out of  78 | elapsed:    1.9s finished


### Save bouts and JSON files

In [16]:
Parallel(n_jobs=-1, verbose=10)(
    delayed(annotate_bouts)(
        DT_ID,
        bout_number,
        syllable_df[syllable_df.wav_loc == bout_df.iloc[0].wav_loc]
        .sort_values(by=["pulse_start"])
        .reset_index(),
        bout_df,
        hparams,
    )
    for bout_number, bout_df in tqdm(enumerate(bout_dfs), total=len(bout_dfs))
);

HBox(children=(IntProgress(value=0, max=769), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1786s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1786s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Batch computation too slow (11.0372s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   13.9s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.9712s.) Setting batch_size=1.
[Parallel(n_jobs=-1)]: Done  60 tasks      | elapsed:   19.2s
[Parallel(n_jobs=-1)]: Done  95 tasks      | elapsed:   24.4s
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   26.0s
[Parallel(n_jobs=-1)]: Done 131 tasks      | elapsed:   27.8s
[Parallel(n_jobs=-1)]: Done 150 




[Parallel(n_jobs=-1)]: Done 769 out of 769 | elapsed:  2.8min finished
