# Prepare the sequence dataframe for Cassin's vireo (CAVI) and California thrasher (CATH)
1. Load the transcriptions
2. Reformat them into a sequence dataset

In [None]:
import numpy as np
import textgrid
from glob import glob
from tqdm import tqdm_notebook as tqdm
import pandas as pd
from datetime import datetime, timedelta
from parallelspaper.birdsong_datasets import prep_CAVI_CATH
from parallelspaper.config.paths import DATA_DIR

### load data

In [None]:
# I grabbed all of the bird-db transcripts and hosted them locally for this.
# This is a glob pattern (two wildcard levels below songs/); it is expanded
# with glob() when the list of inputs is collected.
bird_db_locs = '/mnt/cube/Datasets/bird-db/songs/*/*'

In [None]:
# Threshold forwarded to prep_CAVI_CATH as its isi_thresh argument.
# NOTE(review): presumably an inter-syllable interval in seconds — confirm
# against prep_CAVI_CATH.
isi_thresh = 60
# glob() makes no ordering guarantee (the order depends on the file system),
# so sort the matches to keep the processing order reproducible.
all_indvs = sorted(glob(bird_db_locs))

In [None]:
# Run prep_CAVI_CATH over every matched path with the isi_thresh set above
# and unpack its nine return values.
# NOTE(review): CATH_isi and CAVI_isi each appear twice in the unpacking
# target. Python assigns targets left to right, so the values that survive
# are the 8th (CAVI_isi) and 9th (CATH_isi) returned items; the 2nd and 3rd
# items are overwritten. Confirm this matches prep_CAVI_CATH's return order.
(song_df, CATH_isi, CAVI_isi, CATH_syll_lens, CAVI_syll_lens, CATH_grid_lens,
 CAVI_grid_lens, CAVI_isi, CATH_isi) = prep_CAVI_CATH(all_indvs, isi_thresh=isi_thresh)

In [None]:
# Preview the first three rows of song_df as a quick sanity check.
song_df[:3]

In [None]:
# save dataset
# Only the five sequence-level columns listed below are written to the pickle.
seq_pickle_path = DATA_DIR / 'song_seq_df/CAVI_CATH.pickle'
# to_pickle does not create missing directories, so make sure the target
# folder exists first (assumes DATA_DIR is a pathlib.Path, as its use with
# the `/` operator suggests).
seq_pickle_path.parent.mkdir(parents=True, exist_ok=True)
song_df[['bird', 'species', 'syllables', 'rec_num', 'day']].to_pickle(seq_pickle_path)

##### Make a second dataset that includes all vocalizations within the same recording (for the longest-recording figure)

In [None]:
# Re-run prep_CAVI_CATH with a very large threshold (10e5 is the float
# 1000000.0). Per the heading of this section, the aim is to keep all
# vocalizations of a recording together.
# This rebinds song_df and every other name in the target, so the code that
# follows operates on the results of this call, not the thresholded one.
# NOTE(review): CATH_isi and CAVI_isi each appear twice in the unpacking
# target. Python assigns targets left to right, so the values that survive
# are the 8th (CAVI_isi) and 9th (CATH_isi) returned items; the 2nd and 3rd
# items are overwritten. Confirm this matches prep_CAVI_CATH's return order.
(song_df, CATH_isi, CAVI_isi, CATH_syll_lens, CAVI_syll_lens, CATH_grid_lens,
 CAVI_grid_lens, CAVI_isi, CATH_isi) = prep_CAVI_CATH(all_indvs, isi_thresh=10e5)

In [None]:
# save dataset
# Only the five sequence-level columns listed below are written to the pickle.
full_rec_pickle_path = DATA_DIR / 'song_seq_df/CAVI_CATH_full_recording.pickle'
# to_pickle does not create missing directories, so make sure the target
# folder exists first (assumes DATA_DIR is a pathlib.Path, as its use with
# the `/` operator suggests).
full_rec_pickle_path.parent.mkdir(parents=True, exist_ok=True)
song_df[['bird', 'species', 'syllables', 'rec_num', 'day']].to_pickle(full_rec_pickle_path)

### Get song statistics

In [None]:
# Column layout of the per-species summary table. Rows are appended to this
# frame positionally later on, so the order of the names matters.
stats_columns = [
    # which species the row describes
    'species',
    # individuals
    'unique_birds',
    'num_birds',
    # syllable and recording counts
    'num_syllables_per_bird',
    'num_syllables_total',
    'num_recordings',
    'recording_duration_syllable',
    # repertoire sizes
    'unique_syllables_per_bird',
    'unique_syllables_total',
    # durations and intervals
    'recordings_length_total',
    'recordings_lengths',
    'syllable_duration_s',
    'isi',
]

# Start with an empty frame that only carries the column names.
stats_df = pd.DataFrame(columns=stats_columns)

In [None]:
# Join each species' sequence of syllable-length arrays into one flat array.
CATH_all_syll_lens = np.concatenate(CATH_syll_lens, axis=0)
CAVI_all_syll_lens = np.concatenate(CAVI_syll_lens, axis=0)

# Same flattening for the ISI arrays of each species.
all_CAVI_isi = np.concatenate(CAVI_isi, axis=0)
all_CATH_isi = np.concatenate(CATH_isi, axis=0)

# Number of syllables in every song_df row, split by species.
cath_seq_lens = list(
    map(len, song_df.loc[song_df.species == 'CATH', 'syllables']))
cavi_seq_lens = list(
    map(len, song_df.loc[song_df.species == 'CAVI', 'syllables']))

In [None]:
# Summary statistics for one species; the results are left in module-level
# names that the row-append cell reads.
species = 'CAVI'

# Restrict to this species once. Every per-bird statistic below is computed
# inside this subset, so rows of another species can never be counted even
# if an `indv` label were shared between species.
species_df = song_df[song_df.species == species]

unique_birds = np.unique(species_df.indv)
num_birds = len(unique_birds)

# All syllables of each individual, flattened across that individual's rows.
# Built once and reused for both per-bird statistics.
syllables_by_bird = [
    np.concatenate(species_df[species_df.indv.values == bird].syllables.values)
    for bird in unique_birds
]
num_syllables_per_bird = [len(sylls) for sylls in syllables_by_bird]
unique_syllables_per_bird = [len(np.unique(sylls)) for sylls in syllables_by_bird]

# One song_df row is counted as one recording here.
num_recordings = len(species_df)
# Length of each row's syllable sequence (a count of syllables, not seconds).
recording_duration_syllable = [len(i) for i in species_df.syllables]
unique_syllables_total = len(np.unique(np.concatenate(species_df.syllables.values)))

recordings_length_total = np.sum(CAVI_grid_lens)
recordings_lengths = CAVI_grid_lens
syllable_duration_s = CAVI_all_syll_lens
num_syllables_total = np.sum(recording_duration_syllable)

In [None]:
# Collect the CAVI statistics into one row. The values are positional and
# must follow the column order of stats_df.
cavi_stats_row = [
    species,
    # individuals
    unique_birds,
    num_birds,
    # syllable and recording counts
    num_syllables_per_bird,
    num_syllables_total,
    num_recordings,
    recording_duration_syllable,
    # repertoire sizes
    unique_syllables_per_bird,
    unique_syllables_total,
    # durations and intervals
    recordings_length_total,
    recordings_lengths,
    syllable_duration_s,
    all_CAVI_isi,  # goes into the 'isi' column
]

# Append as a new last row (label = current number of rows).
stats_df.loc[len(stats_df)] = cavi_stats_row

In [None]:
# Summary statistics for one species; the results are left in module-level
# names that the row-append cell reads.
species = 'CATH'

# Restrict to this species once. Every per-bird statistic below is computed
# inside this subset, so rows of another species can never be counted even
# if an `indv` label were shared between species.
species_df = song_df[song_df.species == species]

unique_birds = np.unique(species_df.indv)
num_birds = len(unique_birds)

# All syllables of each individual, flattened across that individual's rows.
# Built once and reused for both per-bird statistics.
syllables_by_bird = [
    np.concatenate(species_df[species_df.indv.values == bird].syllables.values)
    for bird in unique_birds
]
num_syllables_per_bird = [len(sylls) for sylls in syllables_by_bird]
unique_syllables_per_bird = [len(np.unique(sylls)) for sylls in syllables_by_bird]

# One song_df row is counted as one recording here.
num_recordings = len(species_df)
# Length of each row's syllable sequence (a count of syllables, not seconds).
recording_duration_syllable = [len(i) for i in species_df.syllables]
unique_syllables_total = len(np.unique(np.concatenate(species_df.syllables.values)))

recordings_length_total = np.sum(CATH_grid_lens)
recordings_lengths = CATH_grid_lens
syllable_duration_s = CATH_all_syll_lens
num_syllables_total = np.sum(recording_duration_syllable)

In [None]:
# Collect the CATH statistics into one row. The values are positional and
# must follow the column order of stats_df.
cath_stats_row = [
    species,
    # individuals
    unique_birds,
    num_birds,
    # syllable and recording counts
    num_syllables_per_bird,
    num_syllables_total,
    num_recordings,
    recording_duration_syllable,
    # repertoire sizes
    unique_syllables_per_bird,
    unique_syllables_total,
    # durations and intervals
    recordings_length_total,
    recordings_lengths,
    syllable_duration_s,
    all_CATH_isi,  # goes into the 'isi' column
]

# Append as a new last row (label = current number of rows).
stats_df.loc[len(stats_df)] = cath_stats_row

In [None]:
# Persist the per-species summary table.
stats_pickle_path = DATA_DIR / 'stats_df/CAVICATH_stats_df.pickle'
# to_pickle does not create missing directories, so make sure the target
# folder exists first (assumes DATA_DIR is a pathlib.Path, as its use with
# the `/` operator suggests).
stats_pickle_path.parent.mkdir(parents=True, exist_ok=True)
stats_df.to_pickle(stats_pickle_path)