In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Prepare sequence dataframe of starlings
**This notebook takes data from an already generated, UMAP-clustered dataset**
1. Load the pickled UMAP datasets
2. Reformat them into a sequence dataset

In [None]:
from glob import glob
from parallelspaper.config.paths import DATA_DIR
import pandas as pd
from datetime import datetime
import numpy as np
from tqdm.autonotebook import tqdm
from parallelspaper.birdsong_datasets import prep_STARLING

### Load and prepare data

In [None]:
# Find the pickled UMAP datasets.
# glob returns matches in arbitrary (filesystem-dependent) order, so sort them
# to keep the order of the prepared sequences reproducible across machines
# and kernel restarts.
sequence_dfs = sorted(glob(str(DATA_DIR / 'raw/starling_umap/*/*/*.pickle')))

# Threshold handed to prep_STARLING as `isi_thresh`.
# NOTE(review): units are not visible in this notebook (presumably seconds) —
# confirm against prep_STARLING.
isi_thresh = 10

In [None]:
# Build the sequence dataset from the located pickles. prep_STARLING hands back
# four values, unpacked here as: the song dataframe, the sequence lengths,
# the syllable durations and the inter-syllable intervals.
# NOTE(review): the exact types/units of these values are defined in
# parallelspaper.birdsong_datasets — confirm there.
prepared = prep_STARLING(sequence_dfs, isi_thresh=isi_thresh)
song_df, seq_lens, syllable_duration_s, ISIs = prepared

In [None]:
# save dataset
# DataFrame.to_pickle does not create missing parent directories, so make sure
# the output folder exists first; otherwise a fresh checkout fails here.
# Assumes DATA_DIR is a pathlib-style Path (it is combined with `/` in this notebook).
song_seq_path = DATA_DIR / 'song_seq_df/starling.pickle'
song_seq_path.parent.mkdir(parents=True, exist_ok=True)
song_df.to_pickle(song_seq_path)

In [None]:
# Peek at the first three rows of the prepared sequence dataframe.
song_df.head(3)

In [None]:
# Peek at the first three rows recorded for bird 'B335'.
is_b335 = song_df['bird'] == 'B335'
song_df[is_b335].head(3)

### Make Stats DF

In [None]:
# Column layout of the per-species statistics table, grouped by theme.
# The row assigned to this table later in the notebook is positional, so the
# order here matters.
stats_columns = (
    # identity
    ['species']
    # birds
    + ['unique_birds', 'num_birds']
    # syllable / recording counts
    + [
        'num_syllables_per_bird',
        'num_syllables_total',
        'num_recordings',
        'recording_duration_syllable',
    ]
    # repertoire size
    + ['unique_syllables_per_bird', 'unique_syllables_total']
    # durations and intervals
    + [
        'recordings_length_total',
        'recordings_lengths',
        'syllable_duration_s',
        'isi',
    ]
)

# Start with an empty table; one row per species is appended afterwards.
stats_df = pd.DataFrame(columns=stats_columns)

In [None]:
species = 'Starling'

# Select this species' recordings once. Every statistic below is derived from
# this subset, so the per-bird numbers are computed from the same rows that
# `unique_birds` was taken from (instead of re-filtering the full song_df).
species_df = song_df[song_df.species == species]

unique_birds = np.unique(species_df.bird)
num_birds = len(unique_birds)

# Concatenate each bird's syllable sequences a single time and reuse the result
# for both the total and the unique syllable counts.
syllables_by_bird = [
    np.concatenate(species_df[species_df.bird.values == bird].syllables.values)
    for bird in unique_birds
]
num_syllables_per_bird = [len(syllables) for syllables in syllables_by_bird]
unique_syllables_per_bird = [len(np.unique(syllables)) for syllables in syllables_by_bird]

# One row of song_df per recording; the length of its `syllables` entry is the
# recording's duration measured in syllables.
num_recordings = len(species_df)
recording_duration_syllable = [len(syllables) for syllables in species_df.syllables]

# Distinct syllable labels over all recordings of the species.
# NOTE(review): if labels are assigned per bird, the same label may denote
# different syllables across birds — confirm this count is meaningful.
unique_syllables_total = len(np.unique(np.concatenate(species_df.syllables.values)))
num_syllables_total = np.sum(recording_duration_syllable)

In [None]:
# Keep the per-recording lengths returned by prep_STARLING and their overall sum.
recordings_lengths = seq_lens
recordings_length_total = np.sum(recordings_lengths)

In [None]:
# Values for this species, listed in the same order as stats_df's columns
# (the assignment below is positional).
species_stats_row = [
    # identity
    species,
    # birds
    unique_birds, num_birds,
    # syllable / recording counts
    num_syllables_per_bird, num_syllables_total, num_recordings, recording_duration_syllable,
    # repertoire size
    unique_syllables_per_bird, unique_syllables_total,
    # durations and intervals
    recordings_length_total, recordings_lengths, syllable_duration_s, ISIs,
]

# Append as a new row labelled with the current row count.
next_row_label = len(stats_df)
stats_df.loc[next_row_label] = species_stats_row

In [None]:
# Display the assembled statistics table for a visual check before saving.
stats_df

In [None]:
# Save the statistics table.
# DataFrame.to_pickle does not create missing parent directories, so make sure
# the output folder exists first; otherwise a fresh checkout fails here.
# Assumes DATA_DIR is a pathlib-style Path (it is combined with `/` in this notebook).
stats_path = DATA_DIR/'stats_df/starling_stats_df.pickle'
stats_path.parent.mkdir(parents=True, exist_ok=True)
stats_df.to_pickle(stats_path)