In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm.autonotebook import tqdm
from joblib import Parallel, delayed
import umap
import pandas as pd
import collections



In [3]:
from avgn.utils.paths import DATA_DIR, most_recent_subdirectory, ensure_dir

In [4]:
from avgn.utils.hparams import HParams
from avgn.dataset import DataSet 

In [5]:
from avgn.dataset_names import species_dict

In [6]:
# Show every dataset identifier that has an entry in species_dict.
list(species_dict.keys())

['castellucci_mouse_usv_segmented',
 'BIRD_DB_Vireo_cassinii',
 'gibbon_morita_segmented',
 'bengalese_finch_sober',
 'buckeye',
 'swamp_sparrow',
 'mobysound_humpback_whale',
 'koumura_bengalese_finch',
 'batsong_segmented',
 'giant_otter',
 'BIRD_DB_Toxostoma_redivivum',
 'zebra_finch_gardner_segmented',
 'katahira_white_munia_segmented',
 'european_starling_gentner_segmented',
 'macaque_coo',
 'zebra_finch_theunisson',
 'marmoset',
 'hildebrand_Cuviers_beaked_whale',
 'hildebrand_Gervaiss_beaked_whale',
 'NA_BIRDS_american_crow',
 'NA_BIRDS_american_yellow_warbler',
 'NA_BIRDS_blue_jay',
 'NA_BIRDS_cedar_waxwing',
 'NA_BIRDS_chipping_sparrow',
 'NA_BIRDS_common_yellowthroat',
 'NA_BIRDS_great_blue_heron',
 'NA_BIRDS_house_finch',
 'NA_BIRDS_indigo_bunting',
 'NA_BIRDS_marsh_wren',
 'NA_BIRDS_song_sparrow',
 'canary_segmented']

In [7]:
# Dataset identifiers to summarise: every key of species_dict.
DATASET_IDs = list(species_dict.keys())

In [8]:
# HParams built with no arguments; the same object is passed to every DataSet
# constructed in the statistics loop.
hparams = HParams()

In [9]:
def get_call_types(sample_json):
    """Return the unit-type names annotated for the first individual of a record.

    Only the first individual listed under ``sample_json["indvs"]`` is
    inspected. One of its entries counts as a unit type when the entry is a
    dict (``OrderedDict`` included) that has a ``"start_times"`` key.

    Parameters
    ----------
    sample_json : mapping
        Record holding an ``"indvs"`` mapping of individual -> annotations.

    Returns
    -------
    list
        Matching keys, in the order they appear for the first individual.
        Empty when the record lists no individuals.
    """
    indvs = sample_json["indvs"]
    if not indvs:
        # Nothing annotated: report no unit types instead of raising IndexError.
        return []

    # Look the first individual up once instead of re-deriving it per entry.
    first_indv = next(iter(indvs.values()))

    return [
        name
        for name, entry in first_indv.items()
        # isinstance (rather than an exact type comparison) also accepts plain
        # dicts, which is what json.load returns by default.
        if isinstance(entry, dict) and "start_times" in entry
    ]

In [10]:
import librosa

In [11]:
# One summary row per dataset: recorded individuals, number of annotated
# units per unit type, recording durations and unit durations.
dataset_df = pd.DataFrame(
    columns=[
        "DatasetID",
        "individuals",
        "#individuals",
        "#units",
        "total_length_s",
        "wav_lengths",
        "unit_types",
        "#wavs",
        "element_lengths",
        "median_element_lengths",
        "mean_element_lengths",
    ]
)
for dataset_id in tqdm(DATASET_IDs[::-1], desc="datasets"):
    dataset = DataSet(dataset_id, hparams=hparams, build_mel_matrix=False)

    # unit types annotated in this dataset and the individuals it contains
    call_types = get_call_types(dataset.sample_json)
    unique_indvs = np.unique(np.concatenate([list(i) for i in dataset.json_indv]))

    # per-dataset accumulators: duration of every wav, unit count per type,
    # and one list of unit durations per (file, individual, type)
    wav_lengths = []
    n_calls = dict.fromkeys(call_types, 0)
    element_lengths = {i: [] for i in call_types}

    # for these two datasets the duration is always measured from the audio,
    # even when the record carries a "length_s" entry
    measure_from_audio = dataset_id in [
        "hildebrand_Cuviers_beaked_whale",
        "hildebrand_Gervaiss_beaked_whale",
    ]

    for jf in tqdm(dataset.data_files, leave=False):
        dat = dataset.data_files[jf].data

        # duration of this recording, in seconds
        if "length_s" in dat.keys() and not measure_from_audio:
            wav_lengths.append(dat["length_s"])
        else:
            data, rate = librosa.core.load(dat["wav_loc"])
            wav_lengths.append(len(data) / rate)

        # count the units of every type and collect their durations
        for indv, indv_annotations in dat["indvs"].items():
            for ct in call_types:
                if ct not in indv_annotations:
                    continue
                st = indv_annotations[ct]["start_times"]
                et = indv_annotations[ct]["end_times"]
                n_calls[ct] += len(st)
                element_lengths[ct].append([i - j for i, j in zip(et, st)])

    # summary statistics over all units of each type
    median_element_lengths = {}
    mean_element_lengths = {}
    for unit_type, per_file_lengths in element_lengths.items():
        all_lengths = np.concatenate(per_file_lengths)
        median_element_lengths[unit_type] = np.median(all_lengths)
        mean_element_lengths[unit_type] = np.mean(all_lengths)

    # values are listed in the same order as the columns declared above
    dataset_df.loc[len(dataset_df)] = [
        dataset_id,
        unique_indvs,
        len(unique_indvs),
        n_calls,
        np.sum(wav_lengths),
        wav_lengths,
        call_types,
        len(dataset.data_files),
        element_lengths,
        median_element_lengths,
        mean_element_lengths,
    ]

HBox(children=(IntProgress(value=0, description='datasets', max=31, style=ProgressStyle(description_width='ini…

HBox(children=(IntProgress(value=0, description='loading json', max=2320, style=ProgressStyle(description_widt…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 535 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 2320 out of 2320 | elapsed:    5.7s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=2320, style=ProgressStyle(de…

HBox(children=(IntProgress(value=0, max=2320), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=258, style=ProgressStyle(description_width…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 258 out of 258 | elapsed:    0.1s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=258, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, max=258), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=248, style=ProgressStyle(description_width…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 248 out of 248 | elapsed:    0.0s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=248, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, max=248), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=251, style=ProgressStyle(description_width…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 251 out of 251 | elapsed:    0.1s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=251, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, max=251), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=248, style=ProgressStyle(description_width…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 248 out of 248 | elapsed:    0.1s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=248, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, max=248), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=246, style=ProgressStyle(description_width…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 246 out of 246 | elapsed:    0.1s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=246, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, max=246), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=255, style=ProgressStyle(description_width…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 255 out of 255 | elapsed:    0.1s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=255, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, max=255), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=252, style=ProgressStyle(description_width…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 252 out of 252 | elapsed:    0.1s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=252, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, max=252), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=245, style=ProgressStyle(description_width…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 245 out of 245 | elapsed:    0.1s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=245, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, max=245), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=250, style=ProgressStyle(description_width…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    0.1s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=250, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=246, style=ProgressStyle(description_width…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 246 out of 246 | elapsed:    0.1s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=246, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, max=246), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=252, style=ProgressStyle(description_width…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 252 out of 252 | elapsed:    0.1s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=252, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, max=252), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=1936, style=ProgressStyle(description_widt…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 1936 out of 1936 | elapsed:    0.4s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=1936, style=ProgressStyle(de…

HBox(children=(IntProgress(value=0, max=1936), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=2237, style=ProgressStyle(description_widt…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 2237 out of 2237 | elapsed:    0.3s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=2237, style=ProgressStyle(de…

HBox(children=(IntProgress(value=0, max=2237), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=768, style=ProgressStyle(description_width…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 768 out of 768 | elapsed:    0.3s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=768, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, max=768), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=3347, style=ProgressStyle(description_widt…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 3347 out of 3347 | elapsed:    0.6s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=3347, style=ProgressStyle(de…

HBox(children=(IntProgress(value=0, max=3347), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=7284, style=ProgressStyle(description_widt…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 7284 out of 7284 | elapsed:    1.0s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=7284, style=ProgressStyle(de…

HBox(children=(IntProgress(value=0, max=7284), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=3805, style=ProgressStyle(description_widt…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done  22 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 3805 out of 3805 | elapsed:    0.7s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=3805, style=ProgressStyle(de…

HBox(children=(IntProgress(value=0, max=3805), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=169, style=ProgressStyle(description_width…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 169 out of 169 | elapsed:    0.2s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=169, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, max=169), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=18028, style=ProgressStyle(description_wid…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 5664 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 18028 out of 18028 | elapsed:    2.4s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=18028, style=ProgressStyle(d…

HBox(children=(IntProgress(value=0, max=18028), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=92, style=ProgressStyle(description_width=…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  45 out of  92 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  92 out of  92 | elapsed:    0.2s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=92, style=ProgressStyle(desc…

HBox(children=(IntProgress(value=0, max=92), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=452, style=ProgressStyle(description_width…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 452 out of 452 | elapsed:    0.1s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=452, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, max=452), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=83823, style=ProgressStyle(description_wid…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 480 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 25020 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 68520 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 83823 out of 83823 | elapsed:   10.0s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=83823, style=ProgressStyle(d…

HBox(children=(IntProgress(value=0, max=83823), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=2964, style=ProgressStyle(description_widt…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 2964 out of 2964 | elapsed:    0.5s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=2964, style=ProgressStyle(de…

HBox(children=(IntProgress(value=0, max=2964), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=13, style=ProgressStyle(description_width=…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  13 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:    0.0s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=13, style=ProgressStyle(desc…

HBox(children=(IntProgress(value=0, max=13), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=1867, style=ProgressStyle(description_widt…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 1867 out of 1867 | elapsed:    0.5s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=1867, style=ProgressStyle(de…

HBox(children=(IntProgress(value=0, max=1867), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=254, style=ProgressStyle(description_width…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 254 out of 254 | elapsed:    2.5s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=254, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, max=254), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=2663, style=ProgressStyle(description_widt…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 336 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 2663 out of 2663 | elapsed:    0.5s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=2663, style=ProgressStyle(de…

HBox(children=(IntProgress(value=0, max=2663), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=128, style=ProgressStyle(description_width…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 128 out of 128 | elapsed:    0.1s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=128, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, max=128), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=422, style=ProgressStyle(description_width…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 422 out of 422 | elapsed:    0.2s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=422, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, max=422), HTML(value='')))

HBox(children=(IntProgress(value=0, description='loading json', max=133, style=ProgressStyle(description_width…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 133 out of 133 | elapsed:    0.1s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=133, style=ProgressStyle(des…

HBox(children=(IntProgress(value=0, max=133), HTML(value='')))




In [12]:
# Inspect the per-dataset statistics table assembled by the loop above.
dataset_df

Unnamed: 0,DatasetID,individuals,#individuals,#units,total_length_s,wav_lengths,unit_types,#wavs,element_lengths,median_element_lengths,mean_element_lengths
0,canary_segmented,"[bird1, bird2, bird3, bird4, bird6]",5,"{'phrases': 22167, 'syllables': 497338}",36986.856042,"[13.502916666666666, 7.711291666666667, 9.2793...","[phrases, syllables]",2320,"{'phrases': [[0.5822916666666667, 1.0129166666...","{'phrases': 1.3189583333333328, 'syllables': 0...","{'phrases': 1.6103567118548243, 'syllables': 0..."
1,NA_BIRDS_song_sparrow,[UNK],1,{'syllables': 258},32.803062,"[0.07503125, 0.13003125, 0.08503125, 0.1550312...",[syllables],258,"{'syllables': [[0.07503125], [0.13003125], [0....",{'syllables': 0.10503125},{'syllables': 0.1271436531007752}
2,NA_BIRDS_marsh_wren,[UNK],1,{'syllables': 248},23.80775,"[0.05503125, 0.13503125, 0.12003125, 0.1050312...",[syllables],248,"{'syllables': [[0.05503125], [0.13503125], [0....",{'syllables': 0.09003125},{'syllables': 0.09599899193548388}
3,NA_BIRDS_indigo_bunting,[UNK],1,{'syllables': 251},35.977844,"[0.21003125, 0.07003125, 0.23003125, 0.1150312...",[syllables],251,"{'syllables': [[0.21003125], [0.07003125], [0....",{'syllables': 0.13503125},{'syllables': 0.14333802290836653}
4,NA_BIRDS_house_finch,[UNK],1,{'syllables': 248},25.93775,"[0.20503125, 0.07003125, 0.09503125, 0.1100312...",[syllables],248,"{'syllables': [[0.20503125], [0.07003125], [0....",{'syllables': 0.09253125},{'syllables': 0.10458770161290323}
5,NA_BIRDS_great_blue_heron,[UNK],1,{'syllables': 246},44.092688,"[0.28503125, 0.06503125, 0.34503125, 0.2050312...",[syllables],246,"{'syllables': [[0.28503125], [0.06503125], [0....",{'syllables': 0.13753125},{'syllables': 0.17923856707317073}
6,NA_BIRDS_common_yellowthroat,[UNK],1,{'syllables': 255},35.422187,"[0.16503125, 0.04503125, 0.09503125, 0.1000312...",[syllables],255,"{'syllables': [[0.16503125], [0.04503125], [0....",{'syllables': 0.10003125},{'syllables': 0.13891053921568627}
7,NA_BIRDS_chipping_sparrow,[UNK],1,{'syllables': 252},24.942875,"[0.09503125, 0.45003125, 0.04503125, 0.1050312...",[syllables],252,"{'syllables': [[0.09503125], [0.45003125], [0....",{'syllables': 0.09003125},{'syllables': 0.09897966269841268}
8,NA_BIRDS_cedar_waxwing,[UNK],1,{'syllables': 245},115.972531,"[0.55003125, 0.90003125, 0.17003125, 0.3400312...",[syllables],245,"{'syllables': [[0.55003125], [0.90003125], [0....",{'syllables': 0.42503125},{'syllables': 0.4733572704081633}
9,NA_BIRDS_blue_jay,[UNK],1,{'syllables': 250},141.217781,"[0.55003125, 0.41503125, 0.34503125, 0.3950312...",[syllables],250,"{'syllables': [[0.55003125], [0.41503125], [0....",{'syllables': 0.47003125},{'syllables': 0.5648711249999999}


In [13]:
# Debug inspection: `dat` is the global left behind by the statistics loop,
# i.e. the record of the last file it processed. It is undefined on a fresh
# kernel until that loop has run.
dat

OrderedDict([('datetime', '2016-08-05_00-00-00'),
             ('samplerate_hz', 250000),
             ('length_s', 241.958912),
             ('wav_loc',
              '/mnt/cube/Datasets/mouse_usv/VOC591/VOC591_Isolation_Call_CMPA_8_5_2016_10_4.17.WAV'),
             ('age', '10'),
             ('FemaleMouse', 'Isolation'),
             ('call_type', 'Call'),
             ('weight', '4.17'),
             ('indvs',
              OrderedDict([('VOC591',
                            OrderedDict([('syllables',
                                          OrderedDict([('start_times',
                                                        [2.33,
                                                         2.522,
                                                         2.756,
                                                         12.552,
                                                         12.76,
                                                         12.972,
                                

In [14]:
# Debug inspection: `dataset_id` is the loop variable left behind by the
# statistics loop, i.e. the last dataset it iterated over.
dataset_id

'castellucci_mouse_usv_segmented'

In [15]:
# Total recording time over all datasets, converted from seconds to hours.
np.sum(dataset_df.total_length_s.values) / 60/60

399.5625881307327

In [16]:
# Per-dataset unit counts: one {unit type: count} dict per row.
dataset_df['#units'].values

array([{'phrases': 22167, 'syllables': 497338}, {'syllables': 258},
       {'syllables': 248}, {'syllables': 251}, {'syllables': 248},
       {'syllables': 246}, {'syllables': 255}, {'syllables': 252},
       {'syllables': 245}, {'syllables': 250}, {'syllables': 246},
       {'syllables': 252}, {'clicks': 1936}, {'clicks': 2237},
       {'calls': 14289}, {'elements': 3347}, {'coos': 7284},
       {'syllables': 164230}, {'syllables': 109851},
       {'motifs': 18028, 'syllables': 65892}, {'syllables': 15328},
       {'syllables': 452}, {'syllables': 423043}, {'notes': 214915},
       {'syllables': 2006}, {'elements': 97513},
       {'words': 283721, 'phones': 837896}, {'syllables': 215480},
       {'syllables': 10333}, {'syllables': 67316}, {'syllables': 34124}],
      dtype=object)

In [36]:
# Grand total of annotated units over every dataset and unit type.
np.sum(
    np.concatenate(
        [list(unit_counts.values()) for unit_counts in dataset_df["#units"].values]
    )
)

3111477

In [17]:
# Persist the statistics table as a pickle inside DATA_DIR.
dataset_df.to_pickle(DATA_DIR / 'dataset_statistics.pickle')

In [18]:
# Preview the first three rows of the statistics table.
dataset_df.iloc[:3]

Unnamed: 0,DatasetID,individuals,#individuals,#units,total_length_s,wav_lengths,unit_types,#wavs,element_lengths,median_element_lengths,mean_element_lengths
0,canary_segmented,"[bird1, bird2, bird3, bird4, bird6]",5,"{'phrases': 22167, 'syllables': 497338}",36986.856042,"[13.502916666666666, 7.711291666666667, 9.2793...","[phrases, syllables]",2320,"{'phrases': [[0.5822916666666667, 1.0129166666...","{'phrases': 1.3189583333333328, 'syllables': 0...","{'phrases': 1.6103567118548243, 'syllables': 0..."
1,NA_BIRDS_song_sparrow,[UNK],1,{'syllables': 258},32.803062,"[0.07503125, 0.13003125, 0.08503125, 0.1550312...",[syllables],258,"{'syllables': [[0.07503125], [0.13003125], [0....",{'syllables': 0.10503125},{'syllables': 0.1271436531007752}
2,NA_BIRDS_marsh_wren,[UNK],1,{'syllables': 248},23.80775,"[0.05503125, 0.13503125, 0.12003125, 0.1050312...",[syllables],248,"{'syllables': [[0.05503125], [0.13503125], [0....",{'syllables': 0.09003125},{'syllables': 0.09599899193548388}


In [19]:
# Walks the comma-separated "reference" entry of every dataset without using
# the values; the only observable effect is an exception when a dataset has
# no usable "reference" entry in species_dict.
# Distinct loop names are used so the inner loop no longer shadows the outer
# loop variable.
for dataset_name in dataset_df.DatasetID.values:
    for _citation_key in species_dict[dataset_name]["reference"].split(", "):
        pass

In [29]:
# Display columns for the LaTeX table. "makecell", "newline", "REF" and "END"
# are plain-text placeholders; the cell that prints the table swaps them for
# the corresponding LaTeX markup.
dataset_df["Species"] = [
    species_dict[dataset_name]["species"] for dataset_name in dataset_df.DatasetID
]

# a count of exactly one individual is rendered as "Unk."
dataset_df["# Indv."] = [
    "Unk." if n_indv == 1 else str(n_indv) for n_indv in dataset_df["#individuals"]
]

# one "<unit type>: <count>" line per unit type
dataset_df["# Elements"] = [
    "makecell"
    + " newline ".join(
        [unit_type + ": " + str(count) for unit_type, count in unit_counts.items()]
    )
    + "END"
    for unit_counts in dataset_df["#units"].values
]

# one "<unit type>: <median length>" line per unit type, rounded to 3 decimals
dataset_df["Median len. (s)"] = [
    "makecell"
    + " newline ".join(
        [
            unit_type + ": " + str(round(median_len, 3)).zfill(3)
            for unit_type, median_len in medians.items()
        ]
    )
    + "END"
    for medians in dataset_df["median_element_lengths"].values
]

dataset_df["Total length (s)"] = round(dataset_df.total_length_s, 1)
dataset_df["# Rec."] = dataset_df["#wavs"]

# the reference string is already ", "-separated, so it is wrapped as is
dataset_df["References"] = [
    "REF" + species_dict[dataset_name]["reference"] + "END"
    for dataset_name in dataset_df.DatasetID.values
]

In [30]:
# Spot-check the placeholder formatting: "# Elements" string of the first row.
dataset_df[["Species",
            "# Indv.",
            "# Elements",
            "Median len. (s)",
            "Total length (s)",
            "# Rec.",
            "References"]]['# Elements'].values[0]

'makecellphrases: 22167 newline syllables: 497338END'

In [31]:
# Show only the columns that go into the LaTeX table.
dataset_df[["Species",
            "# Indv.",
            "# Elements",
            "Median len. (s)",
            "Total length (s)",
            "# Rec.",
            "References"]]

Unnamed: 0,Species,# Indv.,# Elements,Median len. (s),Total length (s),# Rec.,References
0,Canary,5,makecellphrases: 22167 newline syllables: 497338END,makecellphrases: 1.319 newline syllables: 0.04END,36986.9,2320,REFmarkowitz2013longEND
1,Song sparrow,Unk.,makecellsyllables: 258END,makecellsyllables: 0.105END,32.8,258,"REFzhao2017automated, zhaozhao20181250690END"
2,Marsh wren,Unk.,makecellsyllables: 248END,makecellsyllables: 0.09END,23.8,248,"REFzhao2017automated, zhaozhao20181250690END"
3,Indigo bunting,Unk.,makecellsyllables: 251END,makecellsyllables: 0.135END,36.0,251,"REFzhao2017automated, zhaozhao20181250690END"
4,House finch,Unk.,makecellsyllables: 248END,makecellsyllables: 0.093END,25.9,248,"REFzhao2017automated, zhaozhao20181250690END"
5,Great blue heron,Unk.,makecellsyllables: 246END,makecellsyllables: 0.138END,44.1,246,"REFzhao2017automated, zhaozhao20181250690END"
6,Common yellowthroat,Unk.,makecellsyllables: 255END,makecellsyllables: 0.1END,35.4,255,"REFzhao2017automated, zhaozhao20181250690END"
7,Chipping sparrow,Unk.,makecellsyllables: 252END,makecellsyllables: 0.09END,24.9,252,"REFzhao2017automated, zhaozhao20181250690END"
8,Cedar waxwind,Unk.,makecellsyllables: 245END,makecellsyllables: 0.425END,116.0,245,"REFzhao2017automated, zhaozhao20181250690END"
9,Blue jay,Unk.,makecellsyllables: 250END,makecellsyllables: 0.47END,141.2,250,"REFzhao2017automated, zhaozhao20181250690END"


In [32]:
# Disable column-width truncation so long cells are rendered in full.
# `None` is the supported spelling for "no limit"; the legacy value -1 is only
# used as a fallback for pandas releases that do not accept None.
try:
    pd.set_option("display.max_colwidth", None)
except ValueError:
    pd.set_option("display.max_colwidth", -1)

In [33]:
# Render the display columns as a LaTeX table, sorted by species, then swap
# the plain-text placeholders for LaTeX markup. Backslashes are written as
# explicit "\\" escapes: "\m" and "\c" are not valid Python escape sequences.
print(
    dataset_df[
        [
            "Species",
            "# Indv.",
            "# Elements",
            "Median len. (s)",
            "Total length (s)",
            "# Rec.",
            "References",
        ]
    ]
    .sort_values(by="Species")
    .to_latex(index=False, bold_rows=True)
    .replace("makecell", "\\makecell{")  # open a multi-line cell
    .replace("END", "}")  # close \makecell{...} and \cite{...}
    .replace("newline", "\\\\")  # LaTeX line break inside a cell
    .replace("REF", "\\cite{")  # open the citation command
)

\begin{tabular}{llllrll}
\toprule
                Species & \# Indv. &                                           \# Elements &                                    Median len. (s) &  Total length (s) & \# Rec. &                                              References \\
\midrule
 American crow &  Unk. &  \makecell{syllables: 252} &  \makecell{syllables: 0.37} &  100.5 &  252 &  \cite{zhao2017automated, zhaozhao20181250690} \\
 Bengalese finch &  4 &  \makecell{syllables: 215480} &  \makecell{syllables: 0.065} &  40205.6 &  2663 &  \cite{Nicholson2017} \\
 Bengalese finch &  11 &  \makecell{notes: 214915} &  \makecell{notes: 0.089} &  35365.9 &  2964 &  \cite{koumura2016automatic, Koumura2016} \\
 Blue jay &  Unk. &  \makecell{syllables: 250} &  \makecell{syllables: 0.47} &  141.2 &  250 &  \cite{zhao2017automated, zhaozhao20181250690} \\
 California thrasher &  18 &  \makecell{syllables: 15328} &  \makecell{syllables: 0.146} &  19958.9 &  92 &  \cite{cody2016structure, arriaga2015bird} \