In [1]:
cd /home/ildefonso/Documents/ExtraDrive/repos/vocalpy/Nicholson-Cohen-2024-bio-sound-seg-bench-0.1/

/home/ildefonso/Documents/ExtraDrive/repos/vocalpy/Nicholson-Cohen-2024-bio-sound-seg-bench-0.1


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


# tweetynet-canary dataset prep

This script documents how the dataset was prepared from the canary dataset accompanying the TweetyNet paper.

It assumes that the dataset has been downloaded into a directory, below called `DATA_DIR`.

In [9]:
from collections import defaultdict
import pathlib
import shutil

import crowsetta
import numpy as np
import vocalpy as voc

In [3]:
DATASET_ROOT = pathlib.Path(
    './data/raw/Canary-Song/tweetynet-canary/'
)

BIRD_IDS = [
    'llb11',
    'llb16',
    'llb3',
]

In [4]:
DRY_RUN = False

## Make sure there are no onsets less than zero

We also need to make sure there are no onsets less than zero. This is a quirk of the Matlab GUI annotator, that it sometimes sets the first onset to be less than zero, so we fix it in the annotations.

We check whether the first onset is less than zero, and also whether any other onsets / offsets are less than zero--there should only be cases where the first onset is less than zero, but we check anyways to be sure.

In [5]:
SCRIBE = crowsetta.Transcriber(format='simple-seq')

In [6]:
first_onset_lt_zero = defaultdict(list)
any_onset_lt_zero = defaultdict(list)
any_offset_lt_zero = defaultdict(list)

for bird_id in BIRD_IDS:
    files_root = DATASET_ROOT / f"{bird_id}_data/{bird_id}_songs"
    wav_paths = voc.paths.from_dir(files_root, '.wav')
    csv_paths = voc.paths.from_dir(files_root, '.wav.csv')
    assert len(wav_paths) == len(csv_paths), "len(wav_paths) != len(csv_paths)"
    for wav_path, csv_path in zip(wav_paths, csv_paths):
        simpleseq = SCRIBE.from_file(csv_path)
        if simpleseq.onsets_s[0] < 0.:
            print(
                f"File has first onset less than 0: {csv_path.name}"
            )
            first_onset_lt_zero[bird_id].append(
                (wav_path, csv_path)
            )
        elif np.any(simpleseq.onsets_s[1:]) < 0.:
            print(
                f"File has onset (other than first) less than 0: {csv_path.name}"
            )
            any_onset_lt_zero[bird_id].append((wav_path, csv_path))
        elif np.any(simpleseq.offsets_s) < 0.:
            print(
                f"File has offset less than 0: {csv_path.name}"
            )
            any_offset_lt_zero[bird_id].append((wav_path, csv_path))

File has first onset less than 0: llb16_0275_2018_05_05_13_27_07.wav.csv
File has first onset less than 0: llb3_0071_2018_04_23_17_38_30.wav.csv
File has first onset less than 0: llb3_0533_2018_04_25_08_43_56.wav.csv
File has first onset less than 0: llb3_0613_2018_04_25_13_55_33.wav.csv
File has first onset less than 0: llb3_0770_2018_04_25_17_06_20.wav.csv
File has first onset less than 0: llb3_1055_2018_04_26_10_21_58.wav.csv
File has first onset less than 0: llb3_1112_2018_04_26_11_39_52.wav.csv
File has first onset less than 0: llb3_1475_2018_04_27_06_04_48.wav.csv
File has first onset less than 0: llb3_1707_2018_04_27_13_04_15.wav.csv
File has first onset less than 0: llb3_1826_2018_04_27_15_26_59.wav.csv
File has first onset less than 0: llb3_2172_2018_04_28_11_27_38.wav.csv
File has first onset less than 0: llb3_2294_2018_04_28_13_44_19.wav.csv
File has first onset less than 0: llb3_2530_2018_04_29_05_45_17.wav.csv
File has first onset less than 0: llb3_2538_2018_04_29_05_53_05

In [7]:
if any(
    len(list_) > 0 for list_ in any_onset_lt_zero.values()
):
    raise ValueError()
elif any(
    len(list_) > 0 for list_ in any_offset_lt_zero.values()
):
    raise ValueError()

In [10]:
for bird_id, first_offset_lt_zero_list in first_onset_lt_zero.items():
    if len(first_offset_lt_zero_list) < 1:
        print(f"No files with first offset less than zero for bird ID: {bird_id}")
        continue

    files_root = DATASET_ROOT / f"{bird_id}_data/{bird_id}_songs"
    all_wav_paths = voc.paths.from_dir(files_root, '.wav')
    n_that_need_removing = len(first_offset_lt_zero_list)
    print(
        f"Number that will be removed from dataset for bird ID {bird_id}: {n_that_need_removing} / {len(all_wav_paths)} = "
        f"{n_that_need_removing / len(all_wav_paths) * 100:.2f}%"
    )

    first_offset_lt_zero_dir = files_root / "first_offset_lt_zero"
    if not DRY_RUN:
        first_offset_lt_zero_dir.mkdir(exist_ok=True)
    for wav_path, csv_path in first_offset_lt_zero_list:
        print(f"moving wav and csv path for: {wav_path.stem}")
        if not DRY_RUN:
            shutil.move(wav_path, first_offset_lt_zero_dir)
            shutil.move(csv_path, first_offset_lt_zero_dir)

Number that will be removed from dataset for bird ID llb16: 1 / 1452 = 0.07%
moving wav and csv path for: llb16_0275_2018_05_05_13_27_07
Number that will be removed from dataset for bird ID llb3: 24 / 2655 = 0.90%
moving wav and csv path for: llb3_0071_2018_04_23_17_38_30
moving wav and csv path for: llb3_0533_2018_04_25_08_43_56
moving wav and csv path for: llb3_0613_2018_04_25_13_55_33
moving wav and csv path for: llb3_0770_2018_04_25_17_06_20
moving wav and csv path for: llb3_1055_2018_04_26_10_21_58
moving wav and csv path for: llb3_1112_2018_04_26_11_39_52
moving wav and csv path for: llb3_1475_2018_04_27_06_04_48
moving wav and csv path for: llb3_1707_2018_04_27_13_04_15
moving wav and csv path for: llb3_1826_2018_04_27_15_26_59
moving wav and csv path for: llb3_2172_2018_04_28_11_27_38
moving wav and csv path for: llb3_2294_2018_04_28_13_44_19
moving wav and csv path for: llb3_2530_2018_04_29_05_45_17
moving wav and csv path for: llb3_2538_2018_04_29_05_53_05
moving wav and csv 

## Remove any remaining annotation files where all onset/offset times are not strictly increasing

There are a few cases where onset or offset times are not strictly increasing -- the segments overlap with each other slightly.
Rather than fix this by hand we just remove these few files.

In [11]:
needs_removing = defaultdict(list)

for bird_id in BIRD_IDS:
    files_root = DATASET_ROOT / f"{bird_id}_data/{bird_id}_songs"
    wav_paths = voc.paths.from_dir(files_root, '.wav')
    csv_paths = voc.paths.from_dir(files_root, '.wav.csv')
    assert len(wav_paths) == len(csv_paths), "len(wav_paths) != len(csv_paths)"
    for wav_path, csv_path in zip(wav_paths, csv_paths):
        simpleseq = SCRIBE.from_file(csv_path)
        try:
            voc.metrics.segmentation.ir.concat_starts_and_stops(
                simpleseq.onsets_s, simpleseq.offsets_s
            )
        except:
            print(
                f"caused error when concatenating starts and stops: {csv_path.name}"
            )
            needs_removing[bird_id].append(
                (wav_path, csv_path)
            )

caused error when concatenating starts and stops: llb11_01876_2018_05_08_11_55_42.wav.csv
caused error when concatenating starts and stops: llb11_02147_2018_05_08_16_01_55.wav.csv
caused error when concatenating starts and stops: llb11_02473_2018_05_09_09_57_01.wav.csv
caused error when concatenating starts and stops: llb11_03556_2018_05_11_10_08_16.wav.csv
caused error when concatenating starts and stops: llb3_0357_2018_04_24_17_09_44.wav.csv
caused error when concatenating starts and stops: llb3_0668_2018_04_25_14_50_17.wav.csv
caused error when concatenating starts and stops: llb3_0800_2018_04_25_17_49_23.wav.csv
caused error when concatenating starts and stops: llb3_0804_2018_04_25_17_50_45.wav.csv
caused error when concatenating starts and stops: llb3_0824_2018_04_25_18_20_08.wav.csv
caused error when concatenating starts and stops: llb3_1122_2018_04_26_11_49_21.wav.csv
caused error when concatenating starts and stops: llb3_1124_2018_04_26_11_50_05.wav.csv
caused error when concat

In [12]:
for bird_id, needs_removing_list in needs_removing.items():
    if len(needs_removing_list) < 1:
        print(f"No issue with onset/offset times for bird ID: {bird_id}")
        continue

    files_root = DATASET_ROOT / f"{bird_id}_data/{bird_id}_songs"
    all_wav_paths = voc.paths.from_dir(files_root, '.wav')
    n_that_need_removing = len(needs_removing[bird_id])
    print(
        f"Number that will be removed from dataset for bird ID {bird_id}: {n_that_need_removing} / {len(all_wav_paths)} = "
        f"{n_that_need_removing / len(all_wav_paths) * 100:.2f}%"
    )
    onset_offset_times_issues_dir = files_root / "onset_offset_times_issues"
    if not DRY_RUN:
        onset_offset_times_issues_dir.mkdir(exist_ok=True)
    for wav_path, csv_path in needs_removing_list:
        print(f"moving wav and csv path for: {wav_path.stem}")
        if not DRY_RUN:
            shutil.move(wav_path, onset_offset_times_issues_dir)
            shutil.move(csv_path, onset_offset_times_issues_dir)


Number that will be removed from dataset for bird ID llb11: 4 / 2031 = 0.20%
moving wav and csv path for: llb11_01876_2018_05_08_11_55_42
moving wav and csv path for: llb11_02147_2018_05_08_16_01_55
moving wav and csv path for: llb11_02473_2018_05_09_09_57_01
moving wav and csv path for: llb11_03556_2018_05_11_10_08_16
Number that will be removed from dataset for bird ID llb3: 17 / 2631 = 0.65%
moving wav and csv path for: llb3_0357_2018_04_24_17_09_44
moving wav and csv path for: llb3_0668_2018_04_25_14_50_17
moving wav and csv path for: llb3_0800_2018_04_25_17_49_23
moving wav and csv path for: llb3_0804_2018_04_25_17_50_45
moving wav and csv path for: llb3_0824_2018_04_25_18_20_08
moving wav and csv path for: llb3_1122_2018_04_26_11_49_21
moving wav and csv path for: llb3_1124_2018_04_26_11_50_05
moving wav and csv path for: llb3_1170_2018_04_26_12_25_11
moving wav and csv path for: llb3_1412_2018_04_26_17_51_20
moving wav and csv path for: llb3_1559_2018_04_27_08_04_31
moving wav a