# In this notebook, we segment the waveform into putative syllables
When a dataset is not pre-segmented into individual vocal units, we can try to segment it computationally. Here we'll use dynamic thresholding segmentation to segment bouts into syllables.

You'll need to install the [vocalseg](https://github.com/timsainb/vocalization-segmentation) package to use this. 

In [1]:
import json

from avgn.utils.hparams import HParams
from avgn.dataset import DataSet



In [2]:
# Identifier of the dataset to process; the DataSet object uses it to find the
# matching JSON files in the data folder.
DATASET_ID = "koumura_bengalese_finch"

In [3]:
# Hyperparameters used when processing this dataset (passed to DataSet below).
# Keyword order is intentionally left unchanged.
hparams = HParams(
    # mel filterbank: number of bins and frequency edges
    num_mel_bins=64,
    mel_lower_edge_hertz=500,
    mel_upper_edge_hertz=15000,
    # band-pass cutoffs — presumably in Hz, mirroring the mel edges above (TODO confirm)
    butter_lowcut=500,
    butter_highcut=15000,
    # dB reference level and floor — presumably applied to the spectrogram (TODO confirm)
    ref_level_db=20,
    min_level_db=-30,
    mask_spec=True,
    # analysis window and hop, in milliseconds (per the `_ms` suffix)
    win_length_ms=10,
    hop_length_ms=2,
    # NOTE(review): nex=-1 presumably means "use every example" — confirm in avgn
    nex=-1,
    # forwarded to joblib.Parallel (see the "[Parallel(n_jobs=-1)]" log lines)
    n_jobs=-1,
    verbosity=1,
)

### Create a dataset object
The dataset object loads JSONs corresponding to `DATASET_ID` in the data folder. 

In [4]:
# Build the dataset object; it loads the JSON files that correspond to
# DATASET_ID from the data folder, using the hyperparameters defined above.
dataset = DataSet(DATASET_ID, hparams=hparams)

HBox(children=(IntProgress(value=0, description='loading json', max=2964, style=ProgressStyle(description_widt…

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    7.0s





[Parallel(n_jobs=-1)]: Done 1600 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 2720 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 2964 out of 2964 | elapsed:    8.1s finished


HBox(children=(IntProgress(value=0, description='getting unique individuals', max=2964, style=ProgressStyle(de…



In [5]:
# To make sure everything loaded correctly, let's look at a sample JSON.
# `json` comes from the notebook's import cell; default=str stringifies any
# value that json cannot serialize natively.
sample_json_text = json.dumps(dataset.sample_json, indent=4, default=str)
# Show only the first 400 characters, and add the ellipsis only when the text
# was actually cut so a short JSON is not shown as truncated.
print(sample_json_text[:400] + ("..." if len(sample_json_text) > 400 else ""))

{
    "species": "Lonchura striata domestica",
    "common_name": "Bengalese finch",
    "wav_loc": "/mnt/cube/tsainbur/Projects/github_repos/avgn_paper/data/raw/koumura/zip_contents/Bird8/Wave/108.wav",
    "samplerate_hz": 32000,
    "length_s": 8.8725,
    "indvs": {
        "Bird8": {
            "notes": {
                "start_times": [
                    1.158,
                    1.323,
...


In [6]:
# How many wavs are in the dataset? `dataset.data_files` holds the loaded data
# files (presumably one JSON per recording — see `wav_loc` in the sample above);
# as the cell's last expression, the count is displayed as the cell output.
len(dataset.data_files)

2964

### Perform segmentation

In [9]:
import librosa

In [11]:
### segmentation parameters
# Settings forwarded to dynamic_threshold_segmentation in the example further down.
# Descriptions follow the vocalseg docstring (verify against the installed version);
# units are given by the name suffixes (_ms, _s, _db).

# --- spectrogram ---
n_fft = 1024  # FFT size
hop_length_ms = 2  # hop between STFT columns
win_length_ms = 4  # STFT window length
ref_level_db = 20  # reference level
pre = 0.97  # pre-emphasis coefficient

# --- dynamic threshold search ---
min_level_db = -60  # starting dB floor
min_level_db_floor = -20  # highest value the dB floor may be raised to
db_delta = 5  # step used when moving the dB floor
silence_threshold = 0.05  # level below which the envelope counts as silence

# --- acceptance criteria for a threshold ---
min_silence_for_spec = 0.5  # shortest expected silence
# Must be a plain float: a trailing comma after the literal would bind the
# one-element tuple (0.5,) instead of 0.5.
max_vocal_for_spec = 0.5  # longest expected vocalization
min_syllable_length_s = 0.01  # shortest syllable to keep

# --- frequency band ---
# NOTE(review): butter_min / butter_max are not passed to the example call in the
# visible cells; presumably used later in the notebook — confirm.
butter_min = 500
butter_max = 15000
spectral_range = [500, 15000]

#### First, let's try segmenting an example to make sure the segmentation looks good

In [None]:
# Segment a single example with the segmentation parameters defined above.
# NOTE(review): `data`, `rate` and `dynamic_threshold_segmentation` are not
# defined in the cells visible here — the waveform must be loaded and the
# function imported before this cell can run on a fresh kernel.
results = dynamic_threshold_segmentation(
    data,
    rate,
    # spectrogram settings
    n_fft=n_fft,
    hop_length_ms=hop_length_ms,
    win_length_ms=win_length_ms,
    ref_level_db=ref_level_db,
    pre=pre,
    spectral_range=spectral_range,
    # dynamic threshold search
    min_level_db=min_level_db,
    min_level_db_floor=min_level_db_floor,
    db_delta=db_delta,
    silence_threshold=silence_threshold,
    # acceptance criteria
    min_silence_for_spec=min_silence_for_spec,
    max_vocal_for_spec=max_vocal_for_spec,
    min_syllable_length_s=min_syllable_length_s,
    # print progress while searching for a threshold
    verbose=True,
)