***
# 0.1 Import pied butcherbird song segmentation data
### JX, 03/07/2022
***

This notebook imports data from:

    >/mnt/cube/j8xing/syntax_rhythm_pbb/data/raw  
    >/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/tg
    
and saves JSON at:

    >/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/butcherbird/YYYY-MM-DD_HH-MM-SS
***

## Import locations

In [1]:
from butcherbird.utils.paths import DATA_DIR

In [2]:
DATA_DIR

PosixPath('/mnt/cube/j8xing/syntax_rhythm_pbb/data')

In [3]:
rcd_loc = DATA_DIR/'interim/denoised'
tg_loc = DATA_DIR/'interim/tg'

## Import Dataset Tools

In [4]:
from datetime import datetime
import numpy as np
import pandas as pd

import butcherbird.data.setup as setup

  from tqdm.autonotebook import tqdm


In [5]:
## Instantiate DataSet name
DATASET_ID = 'butcherbird'
DT_ID = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
print('The current working dataset is', DATASET_ID, '\nData instantiated at', DT_ID)

The current working dataset is butcherbird 
Data instantiated at 2022-03-07_17-49-43


In [6]:
## grab a list of all the raw waveforms
wav_list = list(rcd_loc.glob('*.wav')) + list (rcd_loc.glob('*.WAV'))

# get number of rcds, display them in sorted order
print('Discovered', len(wav_list), 'Recordings.')
wav_list

Discovered 7 Recordings.


[PosixPath('/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/denoised/LS110146A.10.13.2015GeoGreens_dn.wav'),
 PosixPath('/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/denoised/LS110593.9.8.2018Araluen_dn.wav'),
 PosixPath('/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/denoised/LS110640.10.1.2018Araluen_dn.wav'),
 PosixPath('/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/denoised/LS110143.10.12.2015GeoGreens_dn.wav'),
 PosixPath('/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/denoised/LS110136.10.10.2015GeoGreens_dn.wav'),
 PosixPath('/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/denoised/LS110131.10.9.2015GeoPool_dn.wav'),
 PosixPath('/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/denoised/LS110395.9.28.2017PalmRagonesi_dn.wav')]

In [7]:
## grab a list of all the textgrids
tg_list = list(tg_loc.glob('*.TextGrid'))

## get number of tgs, display them in sorted order
print('Discovered', len(wav_list), 'Textgrids.')
tg_list

Discovered 7 Textgrids.


[PosixPath('/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/tg/LS110593_9_8_2018Araluen_Bird22.TextGrid'),
 PosixPath('/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/tg/LS110136_10_10_2015GeoGreens_Bird5.TextGrid'),
 PosixPath('/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/tg/LS110143_10_12_2015GeoGreens_Bird5.TextGrid'),
 PosixPath('/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/tg/LS110640_10_1_2018Araluen_Bird22.TextGrid'),
 PosixPath('/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/tg/LS110131_10_9_2015GeoPool_Bird5.TextGrid'),
 PosixPath('/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/tg/LS110146A_10_13_2015GeoGreens_Bird5.TextGrid'),
 PosixPath('/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/tg/LS110395_9_28_2017PalmRagonesi_Bird26.TextGrid')]

## Extract Song Segmentation

In [8]:
df_pbb = pd.concat([setup.song_to_df(wav_path, tg_loc) for wav_path in wav_list])

/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/denoised/LS110146A.10.13.2015GeoGreens_dn.wav


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/489 [00:00<?, ?it/s]

/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/denoised/LS110593.9.8.2018Araluen_dn.wav


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/431 [00:00<?, ?it/s]

/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/denoised/LS110640.10.1.2018Araluen_dn.wav


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/denoised/LS110143.10.12.2015GeoGreens_dn.wav


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/457 [00:00<?, ?it/s]

/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/denoised/LS110136.10.10.2015GeoGreens_dn.wav


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/373 [00:00<?, ?it/s]

/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/denoised/LS110131.10.9.2015GeoPool_dn.wav


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/452 [00:00<?, ?it/s]

/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/denoised/LS110395.9.28.2017PalmRagonesi_dn.wav


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1427 [00:00<?, ?it/s]

In [9]:
df_pbb

Unnamed: 0_level_0,tg_nm,bird_nm,phrase_nb,phrase_strt,phrase_end,phrase_len,note_cnt,note_nb,note_strt,note_end,note_len
wav_nm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
LS110146A.10.13.2015GeoGreens_dn,LS110146A_10_13_2015GeoGreens_Bird5.TextGrid,Bird5,0,798.450000,801.028235,2.578235,13,0,798.450000,798.590953,0.140953
LS110146A.10.13.2015GeoGreens_dn,LS110146A_10_13_2015GeoGreens_Bird5.TextGrid,Bird5,0,798.450000,801.028235,2.578235,13,1,798.710000,798.939041,0.229041
LS110146A.10.13.2015GeoGreens_dn,LS110146A_10_13_2015GeoGreens_Bird5.TextGrid,Bird5,0,798.450000,801.028235,2.578235,13,2,799.100000,799.154704,0.054704
LS110146A.10.13.2015GeoGreens_dn,LS110146A_10_13_2015GeoGreens_Bird5.TextGrid,Bird5,0,798.450000,801.028235,2.578235,13,3,799.211458,799.390000,0.178542
LS110146A.10.13.2015GeoGreens_dn,LS110146A_10_13_2015GeoGreens_Bird5.TextGrid,Bird5,0,798.450000,801.028235,2.578235,13,4,799.430000,799.840000,0.410000
...,...,...,...,...,...,...,...,...,...,...,...
LS110395.9.28.2017PalmRagonesi_dn,LS110395_9_28_2017PalmRagonesi_Bird26.TextGrid,Bird26,1426,7451.938961,7454.270021,2.331060,11,6,7453.230000,7453.347846,0.117846
LS110395.9.28.2017PalmRagonesi_dn,LS110395_9_28_2017PalmRagonesi_Bird26.TextGrid,Bird26,1426,7451.938961,7454.270021,2.331060,11,7,7453.430000,7453.582862,0.152862
LS110395.9.28.2017PalmRagonesi_dn,LS110395_9_28_2017PalmRagonesi_Bird26.TextGrid,Bird26,1426,7451.938961,7454.270021,2.331060,11,8,7453.582862,7453.644792,0.061930
LS110395.9.28.2017PalmRagonesi_dn,LS110395_9_28_2017PalmRagonesi_Bird26.TextGrid,Bird26,1426,7451.938961,7454.270021,2.331060,11,9,7453.786119,7453.917918,0.131800


## Convert df_pbb into JSON for AVGN

In [10]:
## write every wav's json
for wav_path in wav_list:
    setup.wav_to_json(df_pbb, DATASET_ID, DT_ID, wav_path, tg_loc)

/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/denoised/LS110146A.10.13.2015GeoGreens_dn.wav
/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/denoised/LS110593.9.8.2018Araluen_dn.wav
/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/denoised/LS110640.10.1.2018Araluen_dn.wav
/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/denoised/LS110143.10.12.2015GeoGreens_dn.wav
/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/denoised/LS110136.10.10.2015GeoGreens_dn.wav
/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/denoised/LS110131.10.9.2015GeoPool_dn.wav


  json_dict["lengths_s"] = len(read(wav_path)[1])


/mnt/cube/j8xing/syntax_rhythm_pbb/data/interim/denoised/LS110395.9.28.2017PalmRagonesi_dn.wav


  return read(wav_path)[0]
