### Marmoset vocalization dataset custom parsing
- This dataset has:
    - A number of WAVs where naming convention stores the individuals vocalizing
    - Corresponding .mat files with the timing of each phee/call and the individual making the vocalization
- This notebook extracts periods of vocalization into new WAV files, and creates a corresponding JSON and TextGrid for each WAV with annotation information

In [1]:
dataset_id = 'marmoset'

In [2]:
from avgn.utils.general import prepare_env

In [3]:
prepare_env()

env: CUDA_VISIBLE_DEVICES=GPU


### Import relevant packages

In [4]:
from joblib import Parallel, delayed
from tqdm.autonotebook import tqdm
import pandas as pd
import librosa
from datetime import datetime
import json



In [5]:
#import avgn_paper as avgn

In [6]:
import avgn

In [7]:
from avgn.custom_parsing.miller_marmoset import (
    parse_marmoset_data,
    parse_marmoset_calls,
)
from avgn.utils.paths import DATA_DIR

### Load data in original format

In [8]:
DSLOC = avgn.utils.paths.Path('/mnt/cube/Datasets/Marmosets/FromMillerLab')

In [9]:
wavs = list(DSLOC.glob('*.wav'))
len(wavs), wavs[:3]

(186,
 [PosixPath('/mnt/cube/Datasets/Marmosets/FromMillerLab/han.todd.170621.wav'),
  PosixPath('/mnt/cube/Datasets/Marmosets/FromMillerLab/ares_spn_230217_203.wav'),
  PosixPath('/mnt/cube/Datasets/Marmosets/FromMillerLab/ares_ant_010317_33.wav')])

In [10]:
matfiles = list(DSLOC.glob("*.mat"))
len(matfiles), matfiles[:3]

(82,
 [PosixPath('/mnt/cube/Datasets/Marmosets/FromMillerLab/apollo_angel_140217.mat'),
  PosixPath('/mnt/cube/Datasets/Marmosets/FromMillerLab/jasmine.hermes.170622.mat'),
  PosixPath('/mnt/cube/Datasets/Marmosets/FromMillerLab/aladdin_banana_060317.mat')])

### Parse data into dataframe

In [11]:
import re

def parse_marmoset_data(wavs, _filetype="wav"):
    """Parse filename of marmoset data into a pandas dataframe
        
    Arguments:
        wavs {[type]} -- [description]
    
    Keyword Arguments:
        _filetype {str} -- [description] (default: {"wav"})
    
    Returns:
        [type] -- [description]
    """
    wav_df = pd.DataFrame(
        columns=["monkey1", "monkey2", "date", "date_idx", _filetype + "_loc"]
    )
    for _wav in wavs:
        if _wav.stem[0] == ".":
            continue
        monkey1 = None
        date = None
        monkey2 = None
        date_idx = None

        wav_split = _wav.stem.split("_")
        if len(wav_split) == 3:
            monkey1, monkey2, date = wav_split
        elif len(wav_split) == 4:
            monkey1, monkey2, date, date_idx = wav_split
        elif len(wav_split) == 1:
            if len(_wav.stem.split(".")) == 3:
                monkey1, monkey2, date = _wav.stem.split(".")
            elif len(_wav.stem.split(".")) == 2:
                monkey1, date_idx = _wav.stem.split(".")
            elif len(_wav.stem.split(".")) == 4:
                monkey1, date_idx, date, _ = _wav.stem.split(".")
            elif len(re.findall("[A-Z][^A-Z]*", _wav.stem)) == 2:
                monkey1, date_idx = re.findall("[A-Z][^A-Z]*", _wav.stem)
            else:
                continue

        wav_df.loc[len(wav_df)] = [monkey1, monkey2, date, date_idx, _wav]
    return wav_df


In [12]:
wav_df = parse_marmoset_data(wavs, _filetype = "wav")
print(len(wav_df))
display(wav_df[:3])

183


Unnamed: 0,monkey1,monkey2,date,date_idx,wav_loc
0,han,todd,170621,,/mnt/cube/Datasets/Marmosets/FromMillerLab/han...
1,ares,spn,230217,203.0,/mnt/cube/Datasets/Marmosets/FromMillerLab/are...
2,ares,ant,10317,33.0,/mnt/cube/Datasets/Marmosets/FromMillerLab/are...


In [13]:
mf_df = parse_marmoset_data(matfiles, _filetype = "mat")
print(len(mf_df))
display(mf_df[:3])

81


Unnamed: 0,monkey1,monkey2,date,date_idx,mat_loc
0,apollo,angel,140217,,/mnt/cube/Datasets/Marmosets/FromMillerLab/apo...
1,jasmine,hermes,170622,,/mnt/cube/Datasets/Marmosets/FromMillerLab/jas...
2,aladdin,banana,60317,,/mnt/cube/Datasets/Marmosets/FromMillerLab/ala...


In [14]:
# merge dataframes
mf_df = pd.merge(
    mf_df,
    wav_df,
    how="left",
    left_on=["monkey1", "monkey2", "date", "date_idx"],
    right_on=["monkey1", "monkey2", "date", "date_idx"],
    suffixes=(False, False),
)
# remove unlabelled wavs
mf_df = mf_df[mf_df.wav_loc.isnull() == False]
print(len(mf_df))
display(mf_df[:3])

80


Unnamed: 0,monkey1,monkey2,date,date_idx,mat_loc,wav_loc
0,apollo,angel,140217,,/mnt/cube/Datasets/Marmosets/FromMillerLab/apo...,/mnt/cube/Datasets/Marmosets/FromMillerLab/apo...
1,jasmine,hermes,170622,,/mnt/cube/Datasets/Marmosets/FromMillerLab/jas...,/mnt/cube/Datasets/Marmosets/FromMillerLab/jas...
2,aladdin,banana,60317,,/mnt/cube/Datasets/Marmosets/FromMillerLab/ala...,/mnt/cube/Datasets/Marmosets/FromMillerLab/ala...


### Parse matfiles into syllables

In [15]:
from scipy.io import loadmat

def parse_marmoset_calls(row, callers=["monkey1_data", "monkey2_data"]):
    """ Parses a .mat file of marmoset vocalizations into a dataframe
        
    Arguments:
        row {[type]} -- [description]
    
    Keyword Arguments:
        callers {list} -- [description] (default: {["monkey1_data", "monkey2_data"]})
    
    Returns:
        [type] -- [description]
    """
    # load the annotations
    annotations = loadmat(row.mat_loc.as_posix())
    # create syllable_df
    syllable_df = pd.DataFrame(
        columns=[
            "indv",
            "partner",
            "date",
            "call_type",
            "wav_loc",
            "call_num",
            "pulse_n",
            "pulse_start",
            "pulse_end",
        ]
    )
    for caller in callers:
        # determine partner vs indv.
        indv = row.monkey1 if caller == "monkey1_data" else row.monkey2
        partner = row.monkey2 if caller == "monkey1_data" else row.monkey1
        for call_ix, call in enumerate(annotations[caller]):
            # this list goes [start1, end1, start2, end2]
            n_subcalls = int(len(call[1]) / 2)
            call_name = call[0][0]  # e.g. "phee"
            for call_sub in range(n_subcalls):
                subcall_start = call[1][call_sub * 2]
                subcall_end = call[1][(call_sub * 2) + 1]
                # if this call is too long, its probably a mistake
                if ((subcall_end - subcall_start) > 5) or (
                    (subcall_end - subcall_start) <= 0
                ):
                    continue
                syllable_df.loc[len(syllable_df)] = [
                    indv,
                    partner,
                    row.date,
                    call_name,
                    row.wav_loc,
                    call_ix,
                    call_sub,
                    subcall_start[0],
                    subcall_end[0],
                ]
    return syllable_df


In [16]:
syllable_df = pd.concat(
    Parallel(n_jobs=-1, verbose=10)(
        delayed(parse_marmoset_calls)(row)
        for idx, row in tqdm(mf_df.iterrows(), total=len(mf_df))
    )
)

HBox(children=(IntProgress(value=0, max=80), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.4s





[Parallel(n_jobs=-1)]: Done  42 out of  80 | elapsed:    4.3s remaining:    3.9s
[Parallel(n_jobs=-1)]: Done  51 out of  80 | elapsed:    4.9s remaining:    2.8s
[Parallel(n_jobs=-1)]: Done  60 out of  80 | elapsed:    5.3s remaining:    1.8s
[Parallel(n_jobs=-1)]: Done  69 out of  80 | elapsed:    5.8s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  78 out of  80 | elapsed:    7.0s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    7.6s finished


In [17]:
print(len(syllable_df))
display(syllable_df[:3])

14295


Unnamed: 0,indv,partner,date,call_type,wav_loc,call_num,pulse_n,pulse_start,pulse_end
0,apollo,angel,140217,phee,/mnt/cube/Datasets/Marmosets/FromMillerLab/apo...,0,0,14.038007,16.171723
1,apollo,angel,140217,phee,/mnt/cube/Datasets/Marmosets/FromMillerLab/apo...,1,0,107.359792,108.729595
2,apollo,angel,140217,phee,/mnt/cube/Datasets/Marmosets/FromMillerLab/apo...,1,1,109.060383,110.417463


### segment WAVs into 'bouts'
- There are a lot of periods of time in the original datasets that are not occupied by any vocalizations. Here, we segment out those time periods and create new sub-WAVs. For each sub-WAV, we generate a JSON with metadata and segment information. 

In [18]:
from avgn.utils.json import NoIndent, NoIndentEncoder

In [19]:
def segment_wav_into_bouts(wav_df, hparams):
    """ Segments the wav_df full of segmental information into individual bouts
    """
    # populate a list of dataframes corresponding to each bout
    bout_dfs = []
    # first bout starts at first voc
    bout_start = wav_df.iloc[0].pulse_start
    for ri, (idx, row) in enumerate(wav_df.iterrows()):

        # if this is the last voc, it should be the end of the bout
        if ri == len(wav_df) - 1:
            bout_end = row.pulse_end
        # if there is not a gap greater than bout_segmentation_min_s after this voc its part of the same voc
        if ri == len(wav_df) - 1:
            bout_end = row.pulse_end
        else:
            if (
                wav_df.iloc[ri + 1].pulse_start - row.pulse_end
                > hparams.bout_segmentation_min_s
            ):
                bout_end = row.pulse_end
            else:
                continue

        # create a dataframe of only the bout
        bout_df = wav_df[
            (wav_df.pulse_start >= bout_start) & (wav_df.pulse_end <= bout_end)
        ]
        bout_dfs.append(bout_df)
        
        # set next bout start
        if ri < len(wav_df)-1:
            bout_start = wav_df.iloc[ri + 1].pulse_start
        
    return bout_dfs


def load_bout_data(bout_df, wav_df, hparams):
    """ Loads data for marmoset bout given a bout dataframe
    """
    bout_start = bout_df.pulse_start.values[0]
    bout_end = bout_df.pulse_end.values[-1]
    # Ensure padding does not start before WAV starts
    bout_pad_start = hparams.bout_pad_s
    if bout_start - hparams.bout_pad_s < 0:
        bout_pad_start = hparams.bout_pad_s - bout_start

    # load the wav at the relevant times + padding if possible
    clip_duration = (bout_end + hparams.bout_pad_s) - (bout_start - bout_pad_start)
    bout_wav, sr = librosa.load(
        bout_df.iloc[0].wav_loc,
        mono=True,
        sr=None,
        offset=bout_start - bout_pad_start,
        duration=clip_duration,
    )
    # extract a noise clip
    if hparams.get_noise_clip:
        bout_noise, noise_sr = avgn.custom_parsing.general.extract_noise_clip(
            bout_df.iloc[0].wav_loc,
            bout_start,
            bout_end,
            wav_df.pulse_start.values,
            wav_df.pulse_end.values,
            hparams.min_noise_clip_size_s,
            hparams.max_noise_clip_size_s,
        )
    else:
        bout_noise = None
        noise_sr = None
    return bout_wav, sr, bout_noise, noise_sr, bout_start, bout_pad_start


def generate_json(bout_df, bout_number, bout_len, sr, bout_start, bout_pad_start):
    """ Generates a json from 
    """
    wavdate = datetime.strptime(bout_df.date.values[0], "%d%m%y")
    wav_date = wavdate.strftime("%Y-%m-%d_%H-%M-%S")

    # wav general information
    json_dict = {}
    json_dict["bout_number"] = bout_number
    json_dict["datetime"] = wav_date
    json_dict["samplerate_hz"] = sr
    json_dict["original_wav"] = bout_df.wav_loc.values[0].as_posix()
    json_dict["length_s"] = bout_len
    json_dict["time_relative_to_original_wav"] = bout_start - bout_pad_start
    json_dict["indvs"] = {}
    json_dict

    # individual specific information
    for indv in bout_df.indv.unique():
        json_dict["indvs"][indv] = {}
        indv_df = bout_df[bout_df.indv == indv].sort_values(by="pulse_start")
        json_dict["indvs"][indv]["partner"] = indv_df.partner.values[0]
        json_dict["indvs"][indv]["calls"] = {
            "start_times": NoIndent(
                list(indv_df.pulse_start.values - bout_start + bout_pad_start)
            ),
            "end_times": NoIndent(
                list(indv_df.pulse_end.values - bout_start + bout_pad_start)
            ),
            "labels": NoIndent(list(indv_df.call_type.values)),
            "call_num": NoIndent(list(indv_df.call_num.values)),
            "pulse_num": NoIndent(list(indv_df.pulse_n.values)),
        }

    json_txt = json.dumps(json_dict, cls=NoIndentEncoder, indent=2)
    return json_txt


def save_bout_data(DATA_DIR, json_txt, bout_wav, sr, bout_noise, noise_sr, dataset_id, DT_ID, wav_stem, bout_start, bout_pad_start):

    # get time of bout relative to wav
    time_in_wav = bout_start - bout_pad_start
    bout_start_string = avgn.utils.general.seconds_to_str(time_in_wav)

    # output locations
    wav_out = (
        DATA_DIR
        / "processed"
        / dataset_id
        / DT_ID
        / "WAV"
        / (wav_stem + "__" + bout_start_string + ".WAV")
    )
    json_out = (
        DATA_DIR
        / "processed"
        / dataset_id
        / DT_ID
        / "JSON"
        / (wav_stem + "__" + bout_start_string + ".JSON")
    )

    # save wav file
    avgn.utils.paths.ensure_dir(wav_out)
    librosa.output.write_wav(wav_out, y=bout_wav, sr=sr, norm=True)

    # save json
    avgn.utils.paths.ensure_dir(json_out.as_posix())
    print(json_txt, file=open(json_out.as_posix(), "w"))
    
    # save noise file
    if hparams.get_noise_clip:
        noise_out = (
            DATA_DIR
            / "processed"
            / dataset_id
            / DT_ID
            / "NOISE"
            / (wav_stem + "__" + bout_start_string + ".WAV")
        )
        avgn.utils.paths.ensure_dir(noise_out)
        if bout_noise is not None:
            librosa.output.write_wav(noise_out, y=bout_noise, sr=noise_sr, norm=True)

In [20]:
# create a unique datetime identifier for the files output by this notebook
DT_ID = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
DT_ID

'2019-06-17_21-13-02'

In [21]:
# HParams is just a python object storing a set of hyperparameters.
hparams = avgn.utils.general.HParams(
    bout_segmentation_min_s = 30,  # Minimum amount of seconds between vocal activity required to split a wavfile
    bout_pad_s = 5, # how much time to pad this bout with on either side
    # noise clip
    get_noise_clip = True, # if a noise clip preceding the vocalization should be grabbed to help reduce noise in analysis
    max_noise_clip_size_s = 10, # how large the noise clip can be
    min_noise_clip_size_s = 1, # how small the noise clip can be
    
)

In [22]:
bout_dfs =  Parallel(n_jobs=-1, verbose=10)(
        delayed(segment_wav_into_bouts)(
            (
                syllable_df[syllable_df.wav_loc == wav_loc]
                .sort_values(by=["pulse_start"])
                .reset_index()
            ),
            hparams,
        )
        for wav_loc in tqdm(syllable_df.wav_loc.unique())
    )
bout_dfs = [item for sublist in bout_dfs for item in sublist]

HBox(children=(IntProgress(value=0, max=78), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    2.7s





[Parallel(n_jobs=-1)]: Done  39 out of  78 | elapsed:    3.0s remaining:    3.0s
[Parallel(n_jobs=-1)]: Done  47 out of  78 | elapsed:    3.0s remaining:    2.0s
[Parallel(n_jobs=-1)]: Done  55 out of  78 | elapsed:    3.1s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done  63 out of  78 | elapsed:    3.2s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done  71 out of  78 | elapsed:    3.2s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  78 out of  78 | elapsed:    3.3s finished


In [23]:
def segment_and_annotate_bouts(bout_number, wav_df, bout_df, hparams):
    """ segments parsed bouts and annotates as json
    """
    bout_wav, sr, bout_noise, noise_sr, bout_start, bout_pad_start = load_bout_data(bout_df, wav_df, hparams)
    bout_duration = len(bout_wav) / sr
    # generate the json for the bout
    json_txt = generate_json(bout_df, bout_number, bout_duration, sr, bout_start, bout_pad_start)
    # save bout WAV, Noise, and JSON
    wav_stem = bout_df.iloc[0].wav_loc.stem
    save_bout_data(DATA_DIR, json_txt, bout_wav, sr, bout_noise, noise_sr, dataset_id, DT_ID, wav_stem, bout_start, bout_pad_start)

In [25]:
Parallel(n_jobs=-1, verbose=10)(
    delayed(segment_and_annotate_bouts)(
        bout_number,
        syllable_df[syllable_df.wav_loc == bout_df.iloc[0].wav_loc]
        .sort_values(by=["pulse_start"])
        .reset_index(),
        bout_df,
        hparams,
    )
    for bout_number, bout_df in tqdm(enumerate(bout_dfs), total=len(bout_dfs))
)

HBox(children=(IntProgress(value=0, max=769), HTML(value='')))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done  80 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   24.7s
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   30.2s
[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed:   38.1s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:   41.1s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   51.5s
[Parallel(n_jobs=-1)]: Done 194 tasks      | elapsed:   57.3s
[Parallel(n_jobs=-1)]: Done 217 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 240 tasks      | elapsed:  

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,