### Mobysound whale custom parsing
- This is a series of datasets with whale/dolphin echolocation clicks. They are all available on Mobysound. 
    - WAV files for vocalization that contains labels for species and vocalization. 
- This notebook creates a JSON corresponding to each WAV file.
- Dataset origin:
    - https://www.mobysound.org/

In [1]:
from avgn.utils.general import prepare_env



In [2]:
prepare_env()

env: CUDA_VISIBLE_DEVICES=GPU


### Import relevant packages

In [3]:
from joblib import Parallel, delayed
from tqdm.autonotebook import tqdm
import pandas as pd
pd.options.display.max_columns = None
import librosa
from datetime import datetime
import numpy as np
import re

In [4]:
import avgn
from avgn.custom_parsing.picidae_woodpecker import generate_json
from avgn.utils.paths import DATA_DIR

In [5]:
DATASET_ID = "MOBYSOUND_WHALES"

### Load data in original format

In [6]:
# create a unique datetime identifier for the files output by this notebook
DT_ID = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
DT_ID

'2019-09-30_16-07-06'

In [7]:
subfolders = {
        'Melon headed whale':{
            'folder': 'MellonHeaded_MTSTCS',
            'species': 'Peponocephala electra',
            'split': '.',
            'click_types' : [
                            ['good', '*Good.txt']
                        ]
        },
        'Sperm whale':{
            'folder': 'Sperm whales_Bahamas(AUTEC)-Annotated/',
            'species': 'Peponocephala electra',
            'split': '_',
            'click_types' : [
                            ['maybe', '*maybe.txt'],
                            ['good', '*short_good.txt']
                        ]
        },
        'Pilot whale':{
            'folder': 'Pilot_whales_Bahamas(AUTEC)-Annotated-NUWC',
            'species': 'Peponocephala electra',
            'split': '_',
            'click_types' : [
                                ['good', '*good.txt'],
                                ['maybe', '*maybe1.txt']
                        ]
        },
        'Risos whale':{
            'folder': 'Rissos-SCORE-annot',
            'species': 'Peponocephala electra',
            'click_types' : [
                            ['good', '*GoodSingle.txt'],
                            ['poor', '*PoorSingle.txt'],
                        ]
        },
        'Pacific whitesided dolphin':{
            'folder': 'PacWhitesidedDolphin',
            'split': '.',
            'species': 'Peponocephala electra',
            'click_types' : [
                            ['type1h', '*type 1h.txt.txt']
                        ]
        },
        'Blainsville\'s beaked whale':{
            'folder': 'Mesoplodon_Bahamas(AUTEC)-Annotated',
            'split': '.',
            'species': 'Peponocephala electra',
            'click_types' : [
                            ['good', '*/*on_final.log'],
                            ['maybe', '*/*maybe_final.log'],
                        ]
        }
}

In [8]:
DSLOC = avgn.utils.paths.Path('/mnt/cube/Datasets/mobysound/')
DSLOC

PosixPath('/mnt/cube/Datasets/mobysound')

### create JSON for each species

In [9]:
from avgn.utils.audio import get_samplerate
from avgn.utils.json import NoIndent, NoIndentEncoder

In [10]:
from IPython.display import display

In [30]:
# for each species
for common_name, species_dict in tqdm(subfolders.items()):
    # get the folder
    ds_folder = DSLOC / species_dict['folder']
    
    # create a df of wavs in that folder
    wavs = list(ds_folder.glob('*.wav'))
    wav_df = pd.DataFrame(columns = ['stem', 'path', 'rate', 'duration'])
    for wav in wavs:
        try:
            sr = get_samplerate(wav.as_posix())
        except:
            print(wav)
            continue
        wav_duration = librosa.get_duration(filename=wav)
        wav_df.loc[len(wav_df)] = [wav.stem, wav, sr, wav_duration]
    
    # create a dataframe of clicks for those wavs
    clicks_df = []
    for click_type, click_type_glob in species_dict['click_types']:
        good_clicks = list(ds_folder.glob(click_type_glob))
        good_clicks = [i for i in good_clicks if i.stem[0] != '.']
        for click_label_loc in good_clicks:
            good_clicks_df = pd.read_csv(click_label_loc, delimiter='\t')
            good_clicks_df['stem'] = click_label_loc.stem
            good_clicks_df['click_value'] = click_type
            clicks_df.append(good_clicks_df)
    if len(clicks_df) == 0:
        continue
    clicks_df = pd.concat(clicks_df)
    clicks_df.columns = [re.sub(' +', ' ', i) for i in clicks_df.columns]
    clicks_df['stem'] = [row.stem[:-(len(row.click_value)+1)] for idx, row in clicks_df.iterrows()]
    #clicks_df['stem'] = [i.split(species_dict['split'])[0] for i in clicks_df.stem]
        
    display(clicks_df[:3])
        
    # for each wav
    for idx, wavrow in tqdm(wav_df.iterrows()):
        wavclicks = clicks_df[clicks_df.stem == wavrow.stem]
        if len(wavclicks) == 0:
            print(wavrow.stem)
            continue
        if type(wavclicks[" end time"].values[0]) is np.ndarray:
            stop_times = list([i[0] for i in wavclicks[" end time"].values])
        else:
            stop_times = list(wavclicks[" end time"].values)
            
        start_times = list(wavclicks["% start time"].values)
        
        print(start_times[:3], stop_times[:3])

        
        if np.isnan(start_times[0]):
            start_times = list(wavclicks["%start time"].values)
              
        if np.isnan(stop_times[0]):
            stop_times = [i[0] for i in wavclicks.index.values]        
        
        print(start_times[:3], stop_times[:3])
        
        # make json
        json_dict = {}
        json_dict["indvs"] = {
            "UNK": {
                "clicks": {
                    "start_times": NoIndent(start_times),
                    "end_times": NoIndent(stop_times),
                #"peak_freq": NoIndent(list(wavclicks["peak freq"].values)),
                #"center_freq": NoIndent(list(wavclicks["centr. freq"].values)),
                "click_value": NoIndent(list(wavclicks["click_value"].values)),
                }
                
            }
        }
        
        # add species
        json_dict["species"] = species_dict['species']
        json_dict["common_name"] = common_name
        json_dict["wav_loc"] = wavrow.path.as_posix()
        json_dict["stem"] = wavrow.stem
        # rate and length
        json_dict["samplerate_hz"] = wavrow.rate
        json_dict["length_s"] = wavrow.duration

        # dump json
        json_txt = json.dumps(json_dict, cls=NoIndentEncoder, indent=2)
        
        # json save location
        wav_stem = wavrow.stem
        json_out = (
            DATA_DIR
            / "processed"
            / DATASET_ID
            / DT_ID
            / "JSON"
            / (wav_stem + ".JSON")
        )
        
        # save json
        avgn.utils.paths.ensure_dir(json_out.as_posix())
        print(json_txt, file=open(json_out.as_posix(), "w"))
        #print(json_txt)


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,end time,low freq,high freq,peak freq,centr. freq,end time.1,high freq.1,% start time,%start time,Unnamed: 6,centr. freq.1,click_value,low freq.1,peak freq.1,stem
0,120.35451,11936.23522,47548.39651,88.32861,28750.62782,,,120.35314,,,,good,,,MISTCS070124-114000
1,120.43898,13613.61962,47290.33737,89.91411,27392.97215,,,120.4377,,,,good,,,MISTCS070124-114000
2,121.30407,10258.85081,47419.36694,84.14929,22236.85906,,,121.30293,,,,good,,,MISTCS070124-114000


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

[120.35314, 120.4377, 121.30293] [120.35451, 120.43898, 121.30407]
[nan, nan, nan] [nan, nan, nan]
MISTCS070124-113934
[400.81357, 401.21156, 405.2605] [400.81464, 401.2125, 405.26164]
MISTCS070124-111000


Unnamed: 0,% start time,end time,duration,low freq,high freq,Unnamed: 5,stem,click_value
0,0.009,0.012,0.003,4789.633,39380.461,,SpermWh_A2_030306-H12,maybe
1,0.025,0.029,0.004,4226.264,41633.935,,SpermWh_A2_030306-H12,maybe
2,0.041,0.046,0.004,4789.633,40507.198,,SpermWh_A2_030306-H12,maybe


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

[0.289, 0.894, 1.608] [0.311, 0.915, 1.631]
[0.009000000000000001, 0.025, 0.040999999999999995] [0.012, 0.028999999999999998, 0.046]


Unnamed: 0,% start time,end time,low freq,high freq,Unnamed: 4,stem,click_value
0,51.874,51.882,6294.029,45928.382,,Set3-A2-092605-H23-0615-0630-1450-1505loc,good
1,54.831,54.837,7779.34,45928.382,,Set3-A2-092605-H23-0615-0630-1450-1505loc,good
2,56.453,56.46,7388.468,45302.988,,Set3-A2-092605-H23-0615-0630-1450-1505loc,good


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

[0.057999999999999996, 0.361, 0.795] [0.063, 0.366, 0.8009999999999999]
[1.275, 1.494, 1.713] [1.2830000000000001, 1.5, 1.719]
[51.873999999999995, 54.831, 56.453] [51.882, 54.836999999999996, 56.46]
[2.613, 2.8169999999999997, 47.935] [2.622, 2.825, 47.942]
[0.055999999999999994, 0.295, 0.295] [0.06, 0.3, 0.298]
[3.7430000000000003, 4.502, 5.1160000000000005] [3.7489999999999997, 4.5089999999999995, 5.122999999999999]


Unnamed: 0,% start time,end time,low freq,high freq,Unnamed: 4,stem,click_value
0,4.26258,4.27124,21328.39072,40132.12268,,Set1-A2-H17-081406-0000-0030-1225-1255loc.GoodS,good
1,4.39858,4.40724,21427.35773,40033.15567,,Set1-A2-H17-081406-0000-0030-1225-1255loc.GoodS,good
2,4.57381,4.584,21526.32474,41913.52887,,Set1-A2-H17-081406-0000-0030-1225-1255loc.GoodS,good


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Set1-A2-H17-081406-0000-0030-1225-1255loc
/mnt/cube/Datasets/mobysound/PacWhitesidedDolphin/._Lo-070421-164044-Lo-B16h40m44s21apr2007y.wav


Unnamed: 0,184.422 184.423 23944.954 43211.009,189.550 189.550 26366.972 35724.771,click_value,stem
0,184.641 184.642 24385.321 44532.110,,good,Set3_A2_042705_CH1_H12_A0500-0530.mesoplodon_
1,184.866 184.866 24165.138 40788.991,,good,Set3_A2_042705_CH1_H12_A0500-0530.mesoplodon_
2,185.101 185.102 22844.037 44972.477,,good,Set3_A2_042705_CH1_H12_A0500-0530.mesoplodon_


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [26]:
wavclicks

Unnamed: 0,end time,low freq,high freq,peak freq,centr. freq,end time.1,high freq.1,% start time,%start time,Unnamed: 6,centr. freq.1,click_value,low freq.1,peak freq.1,stem
"(41.88, nan)",,,,,,,47548.397,,41.882,,19704.276,good,6258.934,8578.125,MISTCS070124-112000
"(41.897, nan)",,,,,,,47559.064,,41.898,,24827.02,good,9512.613,10117.647,MISTCS070124-112000
"(41.917, nan)",,,,,,,47548.397,,41.919,,23733.798,good,10516.91,13072.5,MISTCS070124-112000
"(41.943000000000005, nan)",,,,,,,47307.101,,41.945,,20261.269,good,6866.999,8590.909,MISTCS070124-112000
"(41.949, nan)",,,,,,,47290.337,,41.951,,23896.944,good,7678.259,7868.056,MISTCS070124-112000
"(41.953, nan)",,,,,,,47548.397,,41.957,,22803.348,good,8323.407,11561.321,MISTCS070124-112000
"(41.955, nan)",,,,,,,47181.119,,41.956,,25892.16,good,9386.631,10852.941,MISTCS070124-112000
"(41.956, nan)",,,,,,,47433.083,,41.957,,27982.309,good,11150.374,13821.429,MISTCS070124-112000
"(42.059, nan)",,,,,,,47559.064,,42.061,,27351.206,good,12158.227,13101.563,MISTCS070124-112000
"(99.027, nan)",,,,,,,47290.337,,99.028,,24713.391,good,11291.087,18964.286,MISTCS070124-112000
