# Make beep1555 songs
### JX, 10/13/2022
> compose fake songs and output them

## load in both dataframes

In [1]:
from starling_rhythm.utils.paths import PROCESSED_DIR, ensure_dir
import pandas as pd

In [2]:
SAVE_PATH = PROCESSED_DIR / 'starling_b1555_syllable_df_labels.pickle'
syllable_df = pd.read_pickle(SAVE_PATH)

In [3]:
SAVE_PATH = PROCESSED_DIR / 'starling_b1555_amp_envs200_spl.pickle'
ae_df = pd.read_pickle(SAVE_PATH).reset_index()

## select only necessary columns to save memory

In [4]:
syllable_df = syllable_df[['ae_df_index', 'onsets_ms', 'offsets_ms', 'hdbscan_labels']]
ae_df = ae_df[['waveform']]

In [5]:
FAKE_SONG_DIR = PROCESSED_DIR/'fake_songs'

In [6]:
FAKE_SONG_DIR

PosixPath('/mnt/cube/j8xing/starling_rhythm_mfdfa/data/fake_songs')

## Synthesis Algorithm

1. Slide 10 second window along syllable start times and capture syllables in that range. 
2. Record actual end time of last syllable
2. For syn_real songs, place down waveforms of original syllables at original start times. 
4. For syn_fake songs, place down waveform templates fo original syllables at tiled start times such that intersyllable gaps are the same. 

In [7]:
import random
from tqdm.autonotebook import tqdm
import numpy as np
import scipy.io.wavfile as wv

  from tqdm.autonotebook import tqdm


In [8]:
def taper_audio(audio, taper):
    '''
    Linearly fade audio in/out
    note: taper is in samples
    '''
    
    ## taper audio
    fade_in = audio[0:taper]*np.linspace(0, 1, num = taper)
    fade_out = audio[-taper:]*np.linspace(1, 0, num = taper)
            
    audio[0:taper] = fade_in
    audio[-taper:] = fade_out
    
    return audio

In [9]:
def fake_songs(bout_nb, sampling_rate = 48000):
    '''
    for a bout_nb, return its real_syn songs, fake_syn_songs, and 100 shuffle_intergap songs
    '''
    real_syns = []
    fake_syns = []
    shuffle_intergaps_total = []
    
    ## grab all of their syllables
    bout = syllable_df[syllable_df['ae_df_index'] == bout_nb]
    
    ## window of inspection is 10 sec / 10000 ms
    window_size = 10
    ## terminal syllable end
    terminal = max(bout['offsets_ms'].values)
    
    ## window
    windows_strt = np.arange(0, terminal - window_size + 1, step = 5)
    windows_end = np.arange(window_size, terminal + 1, step = 5)
    
    ## slide window
    for strt, end in zip(windows_strt, windows_end):
        
        ## between this range, pull out all the syllables 
        bout_slice = bout[(bout['onsets_ms'] > strt) & (bout['offsets_ms'] < end)] 
        
        ## pull out all the syllable labels
        sequence = bout_slice['hdbscan_labels'].values
        
        '''
        Discard rules
        '''
        
        ## discard all segments that don't have at least 10 syllables
        if len(sequence) < 10:
            continue
        
        ## discard all segments that don't have repetition of syllables
        
        #### get reptition counts
        uniques, counts = np.unique(sequence, return_counts = True)
        
        #### any unlabeled syllables are not in this search
        if np.any(uniques == -1):
            uniques = uniques[1:]
            counts = counts[1:]
            
        if len(np.unique(counts)) <= 1:
            continue
        
        #print(strt, end, sequence)
        
        '''
        Generate real synthesized songs:
        '''
        
        min_onset = int(min(bout_slice.onsets_ms) * sampling_rate)
        max_offset = int(max(bout_slice.offsets_ms) * sampling_rate)
        real_syn = np.zeros(max_offset-min_onset)
        taper = 96 ## 2ms
        
        audio = ae_df.loc[bout_nb].waveform
        
        for index, syllable in bout_slice.iterrows():
            ## get the waveform for the syllable
            syllable_sample_onset = int(syllable.onsets_ms * sampling_rate)
            syllable_sample_offset = int(syllable.offsets_ms * sampling_rate)
            syllable_audio = audio[syllable_sample_onset:syllable_sample_offset]
            
            ## taper audio
            syllable_audio = taper_audio(syllable_audio, taper)
            
            ## replace into the blank syn
            real_syn[(syllable_sample_onset-min_onset):(syllable_sample_offset-min_onset)] = syllable_audio
            
        filename = FAKE_SONG_DIR / ('bout_' + str(bout_nb)) / str(str(strt) + '-' + str(end)) / 'real_syn' / ('bout_nb' + str(bout_nb) + '-' + str(strt) + '-' + str(end) + '_real_syn.wav')
        ensure_dir(filename)
        wv.write(filename = filename, data = real_syn, rate = sampling_rate)
        
        '''
        Generate fake synthesized songs:
        '''
        
        # (1) generate a dictionary for labels that appear
        waveform_dict = {'-1': 0}
        for label, count in zip(uniques, counts):

            ## don't record template for not grouped labels
            if label == -1:
                continue

            else:
                ## find the label within syllable_df
                label_df = bout_slice[bout_slice['hdbscan_labels'] == label]
                label_strt = int(label_df['onsets_ms'].values[0] * sampling_rate)
                label_end = int(label_df['offsets_ms'].values[0] * sampling_rate)
                template = audio[label_strt:label_end]
                
                template = taper_audio(template, taper)

                waveform_dict[str(label)] = template
        
        # (2) find all the intersyllable gaps
        gap_list = (bout_slice.onsets_ms.values[1:] - bout_slice.offsets_ms.values[:-1]) * sampling_rate
        total_gap = sum(gap_list)
        gaps = len(bout_slice) - 1
        intergap = int(total_gap/gaps)
        
        # (3) walk through labels, and generate songs
        fake_syn = []
        negative_counter = 0
        negative_df = bout_slice[bout_slice['hdbscan_labels'] == -1]

        for label in bout_slice['hdbscan_labels'].values:

            ## if label is ambiguous, use the existing one
            if label == -1:
                label_strt = int(list(negative_df.iterrows())[negative_counter][1].onsets_ms * sampling_rate)
                label_end = int(list(negative_df.iterrows())[negative_counter][1].offsets_ms * sampling_rate)
                label_audio = audio[label_strt:label_end]
                negative_counter = negative_counter + 1

            ## if label is known, query dictionary
            else:
                label_audio = waveform_dict[str(label)]
                
            ## add the syllable
            fake_syn = np.append(fake_syn, label_audio)
            ## add the intergap
            fake_syn = np.append(fake_syn, np.zeros(intergap)*sampling_rate)
            
        ## delete the last gap by selecting up to the real song length
        fake_syn = fake_syn[:len(real_syn)]
        
        filename = FAKE_SONG_DIR / ('bout_' + str(bout_nb)) / str(str(strt) + '-' + str(end)) / 'fake_syn' / ('bout_nb' + str(bout_nb) + '-' + str(strt) + '-' + str(end) + '_fake_syn.wav')
        ensure_dir(filename)
        wv.write(filename = filename, data = fake_syn, rate = sampling_rate)
        
        '''
        Generate 30 shuffle-intergap songs:
        '''
        shuffle_intergaps = []
        
        for n in np.arange(0, 30):
            
            ## make container
            shuffle_intergap = []
            
            ## shuffle order of intergap
            shuffle_gap_list = gap_list
            random.shuffle(shuffle_gap_list)
            ## add end offset
            shuffle_gap_list = np.append(shuffle_gap_list, [0])
            
            ## reset negative_counter
            negative_counter = 0
            
            for label, gap in zip(bout_slice['hdbscan_labels'].values, shuffle_gap_list):

                ## if label is ambiguous, use the existing one
                if label == -1:
                    label_strt = int(list(negative_df.iterrows())[negative_counter][1].onsets_ms * sampling_rate)
                    label_end = int(list(negative_df.iterrows())[negative_counter][1].offsets_ms * sampling_rate)
                    label_audio = audio[label_strt:label_end]
                    negative_counter = negative_counter + 1

                ## if label is known, query dictionary
                else:
                    label_audio = waveform_dict[str(label)]

                ## add the syllable
                shuffle_intergap = np.append(shuffle_intergap, label_audio)
                ## add the intergap
                shuffle_intergap = np.append(shuffle_intergap, np.zeros(int(gap))*sampling_rate)
                
            ## delete the last gap by selecting up to the real song length
            filename = FAKE_SONG_DIR / ('bout_' + str(bout_nb)) / str(str(strt) + '-' + str(end)) / 'shuffle_intergap' / ('bout_nb' + str(bout_nb) + '-' + str(strt) + '-' + str(end) + '_shuffle_intergap_' + str(n) + '.wav')
            ensure_dir(filename)
            wv.write(filename = filename, data = shuffle_intergap, rate = sampling_rate)
        shuffle_intergaps_total.append(shuffle_intergaps)

In [10]:
from joblib import Parallel, delayed
n_jobs = 36
verbosity = 0

In [11]:
with Parallel(n_jobs=n_jobs, verbose=verbosity) as parallel:
    parallel(
        delayed(fake_songs)(
            bout_nb = bout_nb
        )
        for bout_nb in tqdm(
            np.unique(syllable_df['ae_df_index'].values),
            desc="Creating fake songs",
            leave=False,
        )
    )

Creating fake songs:  76%|███████▌  | 792/1041 [3:20:58<53:41, 12.94s/it]  

ValueError: could not broadcast input array from shape (10992,) into shape (11040,)