### California thrasher custom parsing
- A labelled (but smaller) dataset of California thrasher vocalizations
    - .WAV files with individual labels
- This notebook creates a JSON corresponding to each WAV file (and Noise file where available).
- Dataset origin:
    - http://taylor0.biology.ucla.edu/birdDBQuery/

In [1]:
from avgn.utils.general import prepare_env



In [2]:
# Project helper from avgn.utils.general; in the recorded run its output reads
# `env: CUDA_VISIBLE_DEVICES=GPU`, i.e. it configures the environment for this kernel.
prepare_env()

env: CUDA_VISIBLE_DEVICES=GPU


### Import relevant packages

In [3]:
from joblib import Parallel, delayed
from tqdm.autonotebook import tqdm
import pandas as pd
pd.options.display.max_columns = None
import librosa
from datetime import datetime
import numpy as np
import xlrd

In [4]:
import avgn
from avgn.custom_parsing.bird_db import generate_json
from avgn.utils.paths import DATA_DIR

### Load data in original format

In [5]:
# Short name identifying this dataset (California thrasher recordings, recordist Cody)
DATASET_ID = "california_thrasher_cody"

In [6]:
# Timestamp captured once per run; passed to generate_json below to tag this run's output
DT_ID = format(datetime.now(), "%Y-%m-%d_%H-%M-%S")
DT_ID

'2019-09-30_12-21-15'

In [7]:
# Folder holding the raw California thrasher (CATH) download from BirdDB
DSLOC = DATA_DIR.joinpath('raw', 'bird-db', 'CATH')
DSLOC

PosixPath('/mnt/cube/tsainbur/Projects/github_repos/avgn_paper/data/raw/bird-db/CATH')

In [8]:
# Gather every .wav stored in a `wavs` folder one level below the dataset directory.
# Directory globbing makes no promise about ordering, so sort the paths to keep the
# processing order (and the WAVLIST[0] preview) stable across runs and machines.
WAVLIST = sorted(DSLOC.expanduser().glob('*/wavs/*.wav'))
len(WAVLIST), WAVLIST[0]

(95,
 PosixPath('/mnt/cube/tsainbur/Projects/github_repos/avgn_paper/data/raw/bird-db/CATH/CATH-TLR1/wavs/2012-01-20_08-15-00-000000.wav'))

In [9]:
# Load the BirdDB metadata spreadsheet into a DataFrame, then replace its Audio_file
# column with the hyperlink targets that xlrd exposes for the same sheet.
bird_db_xls = DATA_DIR / "BIRD_DB.xls"
audio_url_col = 11  # sheet column holding the audio hyperlink (Audio_file in the preview below)

song_db = pd.read_excel(bird_db_xls)
mainData_book = xlrd.open_workbook(bird_db_xls, formatting_info=True)
mainData_sheet = mainData_book.sheet_by_index(0)

# One entry per sheet row: the link target, or "" when the cell carries no hyperlink.
# Each cell is looked up once and compared with `is None` rather than `== None`.
sheet_links = (
    mainData_sheet.hyperlink_map.get((row, audio_url_col))
    for row in range(mainData_sheet.nrows)
)
song_urls = ["" if link is None else link.url_or_path for link in sheet_links]

# song_urls has one entry per sheet row, header included; [1:] drops the header entry
# so the list lines up with the DataFrame rows (assumes read_excel's default header row).
song_db["Audio_file"] = song_urls[1:]

# As in the original, the first data row is discarded. The explicit copy makes the
# result an independent frame, so adding the column below does not write into a slice
# of the full table (which triggers pandas' SettingWithCopyWarning).
song_db = song_db[1:].copy()

# File name of the audio URL without directory or extension, e.g. ".../CATH1.wav" -> "CATH1"
song_db["file_stem"] = [
    url.split("/")[-1].split(".")[0] for url in song_db["Audio_file"].values
]

In [10]:
# Preview the first three metadata rows
song_db.head(3)

Unnamed: 0,TrackName,type_of_device,configuration,microphone,recorder,sample_rate,Recordist_first_name,Recordist_last_name,recording_date,recording_time,recording_length,Audio_file,SubjectName,subject_importance,quality_rating,Species_short_name,Subject_species,sex,age_class,identified_by,certainty_of_species,latitude_degrees,latitude_minutes,latitude_seconds,latitude_orientation,longitude_degrees,longitude_minutes,longitude_seconds,longitude_orientation,elevation,Distance_from_marker,position,Marker,Area,Region,country,state_or_province,vegetation_type,Analysis,Analysis_date,Analysis_method,number_of_phrases,Textgrid_file,Keycode_file,file_stem
1,tCOMM09-1,single microphone,MOC2012,Sennheiser omnidirectional with Telinga parabo...,Marantz PMD650,44.0,Martin,Cody,2009-03-21,08:25:00,00:02:30,http://taylor0.biology.ucla.edu/birdDBQuery/Fi...,CATH-CP1,Primary subject,4.0,CATH,California Thrasher,Male(s),Reproductive adult,sight and sound,5.0,38.0,15.0,17.95,N,120.0,53.0,5.43,W,168.0,,,Along road,Lake Comanche,Amador,USA,California,Open Chapparal,CATH processed,2013-02-23,Praat annotation,383.0,Files_TextGrids/2009/March/CATH1.TextGrid,Files_Keys/CATH_2009/TRIPLELETTER_N_CA_phrases...,CATH1
2,tCOMM09-2,single microphone,MOC2012,Sennheiser omnidirectional with Telinga parabo...,Marantz PMD650,44.0,Martin,Cody,2009-03-21,08:27:00,00:00:51,http://taylor0.biology.ucla.edu/birdDBQuery/Fi...,CATH-CP1,Primary subject,4.0,CATH,California Thrasher,Male(s),Reproductive adult,sight and sound,5.0,38.0,15.0,17.95,N,120.0,53.0,5.43,W,168.0,,,Along road,Lake Comanche,Amador,USA,California,Open Chapparal,CATH processed,2013-02-23,Praat annotation,152.0,Files_TextGrids/2009/March/CATH2.TextGrid,Files_Keys/CATH_2009/TRIPLELETTER_N_CA_phrases...,CATH2
3,tCOMM09-3,single microphone,MOC2012,Sennheiser omnidirectional with Telinga parabo...,Marantz PMD650,44.0,Martin,Cody,2009-03-21,08:28:00,00:00:57,http://taylor0.biology.ucla.edu/birdDBQuery/Fi...,CATH-CP1,Primary subject,4.0,CATH,California Thrasher,Male(s),Reproductive adult,sight and sound,5.0,38.0,15.0,17.95,N,120.0,53.0,5.43,W,168.0,,,Along road,Lake Comanche,Amador,USA,California,Open Chapparal,CATH processed,2013-02-23,Praat annotation,140.0,Files_TextGrids/2009/March/CATH3.TextGrid,Files_Keys/CATH_2009/TRIPLELETTER_N_CA_phrases...,CATH3


### Create a JSON for each wav

In [11]:
# Process the recordings one at a time, showing a progress bar. Each call hands
# generate_json the wav path, this run's timestamp ID and the metadata table
# (per the section heading it is expected to produce one JSON per wav).
for wavfile in tqdm(WAVLIST):
    generate_json(wavfile, DT_ID, song_db)

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))


