In [1]:
# NOTE(review): hard-coded, machine-specific working directory -- presumably the repository root, so that the relative './data/...' path used when saving the relabel mapping resolves; adjust for your machine.
cd /home/ildefonso/Documents/repos/vocalpy/Nicholson-Cohen-2023

/home/ildefonso/Documents/repos/vocalpy/Nicholson-Cohen-2023


# tweetynet-canary dataset prep

This notebook documents how the dataset was prepared from the canary dataset accompanying the TweetyNet paper.

It assumes that the dataset has been downloaded and that all the audio (wav) files and annotation (csv) files have been moved into a single directory, below called `data_dir`.

In [1]:
from collections import Counter
import json
import pathlib

import crowsetta
import numpy as np
import vocalpy as voc

## Remapping labels

To combine song from all 3 birds into one dataset, we first re-label so labels across all birds are unique.

In [2]:
# Original label sets, keyed by bird ID: consecutive integers (as strings)
# starting at 1. Only the number of labels differs between birds.
labelsets = {
    bird_id: list(map(str, range(1, n_labels + 1)))
    for bird_id, n_labels in (('llb3', 20), ('llb11', 27), ('llb16', 30))
}

In [3]:
# Give each bird its own contiguous block of integer labels (as strings).
# Numbering starts at 1 rather than 0 so that no new label clashes with the
# "unlabeled" class.
new_labelsets = {}
next_label = 1
for bird_id, labelset in labelsets.items():
    n_labels = len(labelset)
    block_end = next_label + n_labels
    new_labelsets[bird_id] = list(map(str, range(next_label, block_end)))
    # sanity check: each bird keeps the same number of labels
    assert len(new_labelsets[bird_id]) == n_labels
    next_label = block_end

In [4]:
# Show the re-numbered label sets: each bird ID maps to its list of new labels.
print(new_labelsets)

{'llb3': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20'], 'llb11': ['21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47'], 'llb16': ['48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77']}


In [5]:
# For every bird, pair each original label with the new label at the same
# position, giving {bird_id: {old_label: new_label}}.
relabel_mapping = {
    bird_id: dict(zip(old_labels, new_labelsets[bird_id]))
    for bird_id, old_labels in labelsets.items()
}

In [8]:
# Save the old-to-new label mapping as JSON; the path is relative to the
# current working directory.
relabel_mapping_path = pathlib.Path('./data/tweetynet-canary-all-birds-relabel-mapping.json')
with relabel_mapping_path.open('w') as fp:
    json.dump(relabel_mapping, fp, indent=4)

In [6]:
# Directory holding every bird's audio (.wav) and annotation (.wav.csv) files.
# NOTE(review): absolute, machine-specific path -- change it to match your local copy of the dataset.
data_dir = pathlib.Path('/home/ildefonso/Documents/data/vocal/tweetynet-canary-dataset/all-birds/')

The next cell counts how often each label occurs for each bird, so we can verify that every label is present before re-labeling.

In [17]:
# Sanity check before re-labeling: count how often each label occurs across
# each bird's annotation files and print the counts.
scribe = crowsetta.Transcriber(format='simple-seq')

for bird_id, labelset in labelsets.items():
    bird_csvs = sorted(data_dir.glob(f'{bird_id}*.wav.csv'))
    print(f'Counting label occurrences in {len(bird_csvs)} annotation files for bird: {bird_id}')
    simple_seqs = [scribe.from_file(bird_csv) for bird_csv in bird_csvs]
    counts = Counter(
        lbl for simpleseq in simple_seqs for lbl in simpleseq.labels
    )
    # Sort by the integer value of each label rather than lexicographically.
    # Computed outside the f-string: the original inline expression was
    # missing the closing parenthesis of `sorted(...)`, a SyntaxError.
    sorted_counts = sorted(counts.items(), key=lambda tup: int(tup[0]))
    print(f"Counts:\n{sorted_counts}")

Counting label occurrences in 2655 annotation files for bird: llb3
Counts:
[('1', 40074), ('2', 32220), ('3', 164597), ('4', 616), ('5', 719), ('6', 15888), ('7', 10923), ('8', 10269), ('9', 3833), ('10', 1434), ('11', 21444), ('12', 4512), ('13', 3688), ('14', 11335), ('15', 127), ('16', 1084), ('17', 3271), ('18', 23919), ('19', 2063), ('20', 1355)]
Counting label occurrences in 2031 annotation files for bird: llb11
Counts:
[('1', 47112), ('2', 41450), ('3', 3339), ('4', 545), ('5', 12250), ('6', 2529), ('7', 49691), ('8', 49959), ('9', 24504), ('10', 15096), ('11', 3625), ('12', 2920), ('13', 1294), ('14', 5682), ('15', 4234), ('16', 4600), ('17', 344), ('18', 23470), ('19', 8732), ('20', 6223), ('21', 9750), ('22', 352), ('23', 1627), ('24', 1420), ('25', 621), ('26', 77), ('27', 5169)]
Counting label occurrences in 1452 annotation files for bird: llb16
Counts:
[('1', 29663), ('2', 24330), ('3', 3546), ('4', 14875), ('5', 5245), ('6', 1686), ('7', 4516), ('8', 9885), ('9', 4877), (

In [11]:
# Rewrite every annotation file in place, replacing each bird's original
# labels with its new labels; a label with no entry in the mapping is kept
# unchanged (as a string).
# NOTE(review): this cell is not idempotent -- running it a second time would
# re-map labels that were already re-mapped (e.g. '1' -> '21' -> '41' for
# llb11), so run it exactly once per copy of the dataset.
scribe = crowsetta.Transcriber(format='simple-seq')

for bird_id, labelmap in relabel_mapping.items():
    print(f"Label mapping (old to new): {labelmap}")

    bird_csvs = sorted(data_dir.glob(f'{bird_id}*.wav.csv'))
    print(f'Relabeling {len(bird_csvs)} annotations')
    # load every annotation first, then rewrite them one by one
    simple_seqs = [scribe.from_file(bird_csv) for bird_csv in bird_csvs]
    for simple_seq in simple_seqs:
        old_labels = [str(label) for label in simple_seq.labels.tolist()]
        simple_seq.labels = [labelmap.get(old, old) for old in old_labels]
        simple_seq.to_file(simple_seq.annot_path)

Label mapping (old to new): {'1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20'}
Relabeling 2655 annotations
Label mapping (old to new): {'1': '21', '2': '22', '3': '23', '4': '24', '5': '25', '6': '26', '7': '27', '8': '28', '9': '29', '10': '30', '11': '31', '12': '32', '13': '33', '14': '34', '15': '35', '16': '36', '17': '37', '18': '38', '19': '39', '20': '40', '21': '41', '22': '42', '23': '43', '24': '44', '25': '45', '26': '46', '27': '47'}
Relabeling 2031 annotations
Label mapping (old to new): {'1': '48', '2': '49', '3': '50', '4': '51', '5': '52', '6': '53', '7': '54', '8': '55', '9': '56', '10': '57', '11': '58', '12': '59', '13': '60', '14': '61', '15': '62', '16': '63', '17': '64', '18': '65', '19': '66', '20': '67', '21': '68', '22': '69', '23': '70', '24': '71', '25': '72', '26': '73', '27': '74', '28':

## Make sure there are no onsets less than zero

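We also need to make sure there are no onsets less than zero. This is a quirk of the Matlab GUI annotator: it sometimes sets the first onset to be less than zero. The next cell counts how many annotation files are affected, so that any such files can be fixed; as the output shows, none were found in this dataset.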

In [14]:
# Count, for each bird, how many annotation files contain an onset time or an
# offset time below zero. This cell only reports the counts; it does not
# modify any annotation file.
scribe = crowsetta.Transcriber(format='simple-seq')

for bird_id in relabel_mapping.keys():
    bird_csvs = sorted(data_dir.glob(f'{bird_id}*.wav.csv'))
    print(f'Fixing onset times less than 0 in {len(bird_csvs)} annotations')
    simple_seqs = [scribe.from_file(bird_csv) for bird_csv in bird_csvs]
    n_onsets_lt_0 = 0
    # offsets are counted as well, because the summary below reports both;
    # without this counter the print would raise NameError on a fresh kernel
    n_offsets_lt_0 = 0
    for simple_seq in simple_seqs:
        if np.any(simple_seq.onsets_s < 0.):
            n_onsets_lt_0 += 1
        if np.any(simple_seq.offsets_s < 0.):
            n_offsets_lt_0 += 1
    print(f"Bird ID: {bird_id}, n. seqs with onsets less than 0: {n_onsets_lt_0}, n. seqs with offsets less than 0: {n_offsets_lt_0}")


Fixing onset times less than 0 in 2655 annotations
Bird ID: llb3, n. seqs with onsets less than 0: 0, n. seqs with offsets less than 0: 0
Fixing onset times less than 0 in 2031 annotations
Bird ID: llb11, n. seqs with onsets less than 0: 0, n. seqs with offsets less than 0: 0
Fixing onset times less than 0 in 1452 annotations
Bird ID: llb16, n. seqs with onsets less than 0: 0, n. seqs with offsets less than 0: 0


Finally, we fix any other quirks that prevent us from computing segmentation metrics.

## Remove any remaining annotation files whose onset/offset times are not all strictly increasing

There are a few cases where onset or offset times are not strictly increasing -- the segments overlap with each other slightly.
Rather than fix this by hand we just remove these few files.

In [7]:
from collections import defaultdict

In [15]:
# Collect, per bird, every annotation for which
# `voc.metrics.segmentation.ir.concat_starts_and_stops` raises when given the
# annotation's onset and offset times. These are the files to remove.
scribe = crowsetta.Transcriber(format='simple-seq')

needs_removing = defaultdict(list)

for bird_id in relabel_mapping.keys():
    bird_csvs = sorted(data_dir.glob(f'{bird_id}*.wav.csv'))
    print(f'Checking onset and offset times from {len(bird_csvs)} annotations')
    simple_seqs = [scribe.from_file(bird_csv) for bird_csv in bird_csvs]
    for simple_seq in simple_seqs:
        try:
            voc.metrics.segmentation.ir.concat_starts_and_stops(simple_seq.onsets_s, simple_seq.offsets_s)
        except Exception:
            # Any error raised by the check marks the file for removal.
            # Catching `Exception` instead of using a bare `except:` lets
            # KeyboardInterrupt and SystemExit propagate, so interrupting
            # the kernel cannot silently flag a valid file.
            needs_removing[bird_id].append(simple_seq)

Checking onset and offset times from 2655 annotations
Checking onset and offset times from 2031 annotations
Checking onset and offset times from 1452 annotations


In [20]:
# Report, per bird, how many annotation files were flagged for removal and
# what percentage of that bird's annotation files this represents.
for bird_id in relabel_mapping:
    if bird_id not in needs_removing:
        print(f"No issue with onset/offset times for bird ID: {bird_id}")
        continue

    n_flagged = len(needs_removing[bird_id])
    n_total = len(sorted(data_dir.glob(f'{bird_id}*.wav.csv')))
    percent_flagged = n_flagged / n_total * 100
    print(f"Number that will be removed from dataset for bird ID {bird_id}: {n_flagged} / {n_total} = {percent_flagged:.2f}%")

Number that will be removed from dataset for bird ID llb3: 17 / 2655 = 0.64%
Number that will be removed from dataset for bird ID llb11: 4 / 2031 = 0.20%
No issue with onset/offset times for bird ID: llb16


In [26]:
# Delete each flagged annotation file together with its audio file.
for bird_id, needs_removing_list in needs_removing.items():
    print(f"Removing {len(needs_removing_list)} wav files + annotations for bird ID: {bird_id}")
    for simpleseq_to_remove in needs_removing_list:
        annot_path = simpleseq_to_remove.annot_path
        # Annotation files are named '<audio file name>.csv' (matched above
        # with the '*.wav.csv' glob), so dropping only the final suffix gives
        # the audio path. `with_suffix('')` is used instead of
        # `name.replace('.csv', '')`, which would also strip any other
        # '.csv' occurring inside the file name.
        wav_path_to_remove = annot_path.with_suffix('')
        annot_path.unlink()
        wav_path_to_remove.unlink()

Removing 17 wav files + annotations for bird ID: llb3
Removing 4 wav files + annotations for bird ID: llb11


Confirm we removed them

In [28]:
# Verify that neither the annotation file nor its audio file still exists for
# any annotation that was flagged for removal.
for bird_id, needs_removing_list in needs_removing.items():
    for simpleseq_to_remove in needs_removing_list:
        annot_path = simpleseq_to_remove.annot_path
        assert not annot_path.exists()
        # Drop only the final '.csv' suffix to get the audio path;
        # `name.replace('.csv', '')` would also strip any other '.csv'
        # occurring inside the file name.
        wav_path_to_remove = annot_path.with_suffix('')
        assert not wav_path_to_remove.exists()