This code loops over all files and cuts up the whole file into song bouts, so they can be filtered based on different criteria.
09/11/2022

In [1]:
import math
import pandas
import os
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from functools import reduce
from math import nan

'motif_finder_unisyl' filters song bouts based on the number of unique syllables in that bout. Currently, it removes bouts with 3 or less unique notes per bout (5 including 'Start' and 'End')

In [2]:
#Function to find the indices of all the occurrence of 'End' in Syllable label
def find_end_indices(list, element):
    indices = [0]
    for idx, value in enumerate(list):
        if value == element:
            indices.append(idx)
    return indices

#Define a function to pick song bouts that are long in length
#Returns a list of data frames
def motif_finder(bouts_list):
    song_w_motif = bouts_list.copy()
    syll_num = []
    short_song_idx = []
    for i in range(len(bouts_list)):
        bout = bouts_list[i]
        syllable_list = bout['Syll Label'].tolist()
        syll_num.append(len(syllable_list))
    avg_syl_num = np.mean(syll_num)
    for i in range(len(syll_num)):
        if syll_num[i] < avg_syl_num/3:
            short_song_idx.append(i)
    #as each index is deleted, the list shortens, so indices have to be updated
    for i in range(len(short_song_idx)):
        short_song_idx[i] -= i
    for j in short_song_idx:
        del song_w_motif[j]
    return song_w_motif

#Define a function to pick song bouts that more than 3 unique syllables in the bout
#Returns a list of data frames
def motif_finder_unisyl(bouts_list):
    song_w_motif = bouts_list.copy()
    uni_syll_num = []
    short_song_idx = []
    for i in range(len(bouts_list)):
        bout = bouts_list[i]
        syl_n = bout['Syll Label'].value_counts()
        uni_syll_num.append(len(syl_n))
    #print(uni_syll_num)
    for i in range(len(uni_syll_num)):
        #Start, End, 3 unique syllables = 5
        #Any bout with only 3 unique syllables should be removed as a call bout
        if uni_syll_num[i] < 6:
            short_song_idx.append(i)
    #as each index is deleted, the list shortens, so indices have to be updated
    for i in range(len(short_song_idx)):
        short_song_idx[i] -= i
    for j in short_song_idx:
        del song_w_motif[j]
    return song_w_motif

#Combines all the dataframes in a list
def concatenate_bout_list(bout_list):
    new_df = pandas.DataFrame()
    for i in range(len(bout_list)):
        new_df = pandas.concat([new_df, bout_list[i]])
    start_row = [0, 0, 'Start', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    new_df = pandas.DataFrame(np.insert(new_df.values, 0, start_row, axis=0))
    return new_df

In [3]:
directory = 'D:\\4th Year\\Semester 7\\BI4313 Sem Project\\IN-comparative-analysis\\IN-comparative-analysis\\Loop\\Start_End'

start_end_files = []

# iterate over files in
# that directory
for filename in os.scandir(directory):
    if filename.is_file():
        start_end_files.append(filename.path)

In [10]:
labels = ['BCC_M01', 'BCC_M03', 'BCC_M08', 'BCC_M09', 'BCC_M11', 'BCC_M19', 'BCC_M21', 'BCC_M22', 
    'BF_brn24pnk13', 'BF_brn25pnk14', 'BF_brn35pnk21', 'BF_brn36pnk24', 'BF_org27ylw19',
    'JF_red28blu13', 'JF_ylw14gry00', 'JF_ylw20gry09', 'JF_ylw22gry08', 'JF_ylw25gry11',
    'ZF_grn21org41', 'ZF_org01wht58', 'ZF_org11pnk05', 'ZF_pnk93pnk91', 'ZF_red15ylw15', 'ZF_red77pnk45', 'ZF_ylw67brn42', 'ZF_ylw95ylw29']

sp_label = ['BCC', 'BCC', 'BCC', 'BCC', 'BCC', 'BCC', 'BCC', 'BCC', 
    'BF', 'BF', 'BF', 'BF', 'BF', 
    'JF', 'JF', 'JF', 'JF', 'JF',
    'ZF', 'ZF', 'ZF', 'ZF', 'ZF', 'ZF', 'ZF', 'ZF', ]


In [None]:
labels = ['ASB_Blackblue', 'ASB_orange', 'ASB_red', 'ASB_white', 'ASB_y36a', 'ASB_yellow',
          'BCC_M01', 'BCC_M03', 'BCC_M08', 'BCC_M09', 'BCC_M11', 'BCC_M19', 'BCC_M21', 'BCC_M22',
          'BF_brn20pnk9', 'BF_brn24pnk13', 'BF_brn25pnk14', 'BF_brn35pnk21', 'BF_brn36pnk24', 'BF_org20ylw24', 'BF_org27ylw19',
          'ISB_r5y81', 'ISB_y51i', 'ISB_y54i', 'ISB_y61i', 'ISB_y63i',
          'JF_red28blu13', 'JF_y23g09', 'JF_ylw14gry00', 'JF_ylw20gry09', 'JF_ylw22gry08', 'JF_ylw25gry11',
          'SF_y08s', 'SF_y14s', 'SF_y47s', 'SF_y50s',
          'ZF_grn21org41', 'ZF_org01wht58', 'ZF_org11pnk05', 'ZF_pnk93pnk91', 'ZF_red15ylw15', 'ZF_red77pnk45', 'ZF_ylw67brn42', 'ZF_ylw95ylw29']

sp_label = ['ASB', 'ASB', 'ASB', 'ASB', 'ASB', 'ASB',
            'BCC', 'BCC', 'BCC', 'BCC', 'BCC', 'BCC', 'BCC', 'BCC',
            'BF', 'BF', 'BF', 'BF', 'BF', 'BF', 'BF',
            'ISB', 'ISB', 'ISB', 'ISB', 'ISB',
            'JF', 'JF', 'JF', 'JF', 'JF', 'JF',
            'SF', 'SF', 'SF', 'SF',
            'ZF', 'ZF', 'ZF', 'ZF', 'ZF', 'ZF', 'ZF', 'ZF']

In [5]:
save_dir_sb = 'D:\\4th Year\\Semester 7\\BI4313 Sem Project\\IN-comparative-analysis\\IN-comparative-analysis\\Loop\\temp\\'

for file_idx in range(len(start_end_files)):
    data = pandas.read_csv(start_end_files[file_idx], header=None,
        names = ['FileName', 'Syll #', 'Syll Label',
        ' Syll Onset (ms)', ' Syll Offset (ms)', ' Syll Duration (sec)', ' Mean Frequency (Hz)',
        ' Entropy', 'Log Amplitude (dB)', 'Pitch Goodness', 'FrequencyModulation',
        'AmplitudeModulation', 'EntropyVariance', 'Fundamental Frequency (Hz)', 'RMS Amplitude'])
    
    syl = data['Syll Label']
    all_syl = syl.tolist()
    end_indices = find_end_indices(all_syl, 'End')

    song_bouts = []
    #This method of slicing includes 'Start' and 'End'
    for i in range(len(end_indices)-1):
        x = data.iloc[end_indices[i] + 1 : end_indices[i+1] + 1, :]
        song_bouts.append(x)

    song_motif = motif_finder_unisyl(song_bouts)

    motif_bouts = concatenate_bout_list(song_motif)
    motif_bouts.columns = ['FileName', 'Syll #', 'Syll Label',
      ' Syll Onset (ms)', ' Syll Offset (ms)', ' Syll Duration (sec)', ' Mean Frequency (Hz)',
      ' Entropy', 'Log Amplitude (dB)', 'Pitch Goodness', 'FrequencyModulation',
      'AmplitudeModulation', 'EntropyVariance', 'Fundamental Frequency (Hz)', 'RMS Amplitude']

    fname_sb = save_dir_sb + labels[file_idx] + '_bout2_start_end.csv'
    motif_bouts.to_csv(fname_sb, header=False, index=False)


Calculate everything for those files with song motifs (song bouts with sufficiently high number of unoque syllables)

In [6]:
directory = 'D:\\4th Year\\Semester 7\\BI4313 Sem Project\\IN-comparative-analysis\\IN-comparative-analysis\\Loop\\temp'

song_bout_files = []

# iterate over files in
# that directory
for filename in os.scandir(directory):
    if filename.is_file():
        song_bout_files.append(filename.path)


In [7]:
save_dir_bout_tp = 'D:\\4th Year\\Semester 7\\BI4313 Sem Project\\IN-comparative-analysis\\IN-comparative-analysis\\Loop\\temp\\'

for file_idx in range(len(song_bout_files)):
   data = pandas.read_csv(song_bout_files[file_idx], header=None,
      names = ['FileName', 'Syll #', 'Syll Label',
      ' Syll Onset (ms)', ' Syll Offset (ms)', ' Syll Duration (sec)', ' Mean Frequency (Hz)',
      ' Entropy', 'Log Amplitude (dB)', 'Pitch Goodness', 'FrequencyModulation',
      'AmplitudeModulation', 'EntropyVariance', 'Fundamental Frequency (Hz)', 'RMS Amplitude'])

   syl = data['Syll Label']
   #Get the set of unique syllables for the individual with their own index (so convert array to list)
   uniq_syl = syl.unique()
   uniq_syl = uniq_syl.tolist()
   #Get the number of syllables to create a N x N matrix for transition
   N = len(uniq_syl)
   trans_matrix = np.array(np.zeros((N, N), dtype = int))
   #Convert 'object' type to array to get indices
   syl = np.array(syl)
   #This gives the number of syllable transitions
   for i in range(len(syl)-1):
      a = uniq_syl.index(syl[i])
      b = uniq_syl.index(syl[i+1])
      trans_matrix[a, b] += 1
   #Divide by total number of transitions
   trans_matrix = (trans_matrix.T/trans_matrix.sum(axis=1)).T
   trans_matrix = np.around(trans_matrix, 2)
   
   #To remove values less than 0.05 
   for i in range(len(trans_matrix)):
      for j in range(len(trans_matrix)):
         if trans_matrix[i, j] < 0.05:
            trans_matrix[i, j] = 0
   #Add the labels of syllables as an extra row and column
   syl_name = np.array(uniq_syl)
   trans_prob = np.concatenate([[syl_name], trans_matrix])

   #Add a '0' in the beginning to insert this as a column
   syl_name = np.concatenate([[0], syl_name])
   trans_prob = np.insert(trans_prob, 0, syl_name, axis = 1)

   transition_probability = pandas.DataFrame(trans_prob)
   fname_tp = save_dir_bout_tp + labels[file_idx] + '_bout2_trans_prob.csv'
   transition_probability.to_csv(fname_tp, header = False, index = False)

Calculating quantifiers from syllable transition matrix

In [12]:
directory = 'D:\\4th Year\\Semester 7\\BI4313 Sem Project\\IN-comparative-analysis\\IN-comparative-analysis\\Loop\\Bout_trans_prob_processed'

bout_tp_files = []

# iterate over files in
# that directory
for filename in os.scandir(directory):
    if filename.is_file():
        bout_tp_files.append(filename.path)

In [15]:
data = {
    'Bird name' : [],
    'Species' : [],
    'No. of Unique Syllables' : [],
    'Sequence Linearity' : [],
    'Sequence Consistency' : [],
    'Sequence stereotypy' : [],
    'Transition Entropy' : [],
    'No. of Unique Start Syll' : [],
    'Fraction of Unique Start Syll' : [],
    'No. of Unique transitions' : []
}

df = pandas.DataFrame(data)

df.to_csv('01. Song_quantifiers.csv', index=False)

In [17]:
#From the song-motif transition probability, this code calculates sequence linearity, consistency, stereotypy, 
#transition entropy and the number of initial starting syllables

for file_idx in range(len(bout_tp_files)):
    file = pandas.read_csv(bout_tp_files[file_idx], header=None)
    #Drop the row and column labels, just keep the numeric values
    trans_prob = file.iloc[1:, 1:]
    trans_prob = np.array(trans_prob)
    trans_prob = trans_prob.astype(float)

    #Get the set of unique syllables for the individual with their own index (so convert array to list)
    uniq_syl = file.iloc[0].to_numpy()
    uniq_syl = np.delete(uniq_syl, 0)

    #SEQUENCE LINEARITY
    #Number of unique syllables
    n_syl = len(uniq_syl)
    #To get the number of unique transitions, create an array with 1 whenever trans_prob > 0
    #and get the sum of the array
    trans_num = np.zeros_like(trans_prob)
    trans_num[trans_prob > 0] = 1
    n_trans = sum(map(sum, trans_num))
    s_lin = n_syl/n_trans

    #SEQUENCE CONSISTENCY
    #numerator - sum of typical transitions
    typ_trans = np.max(trans_prob, axis = 1)
    typ_sum = sum(typ_trans)
    #denominator - sum of all transitions
    trans_sum = sum(map(sum, trans_prob))
    s_con = typ_sum/trans_sum

    #SEQUENCE STEREOTYPY
    s_stereo = (s_lin + s_con)/2

    #TRANSITION ENTROPY
    trans_entropy = trans_prob
    for i in range(len(trans_entropy)):
        for j in range(len(trans_entropy)):
            trans_entropy[i, j] = -trans_entropy[i, j] * np.log2(trans_entropy[i, j])
    trans_entropy[np.isnan(trans_entropy)] = 0
    transition_entropy = sum(map(sum, trans_entropy))

    #To get the number of transitions from 'Start'
    syl_list = uniq_syl.tolist()
    s = syl_list.index('Start') #get the idex of 'Start' row
    #sum over Number of transitions matrix 'Start' row using the index
    n_start = np.sum(trans_num[s])

    data = {
    'Bird name' : [labels[file_idx]],
    'Species' : [sp_label[file_idx]],
    'No. of Unique Syllables' : [n_syl],
    'Sequence Linearity' : [s_lin],
    'Sequence Consistency' : [s_con],
    'Sequence stereotypy' : [s_stereo],
    'Transition Entropy' : [transition_entropy],
    'No. of Unique Start Syll' : [n_start],
    'Fraction of Unique Start Syll' : [n_start/n_syl],
    'No. of Unique transitions' : [n_trans]
    }
    df = pandas.DataFrame(data)

    df.to_csv('01. Song_quantifiers.csv', mode = 'a', index=False, header=False)

  trans_entropy[i, j] = -trans_entropy[i, j] * np.log2(trans_entropy[i, j])
  trans_entropy[i, j] = -trans_entropy[i, j] * np.log2(trans_entropy[i, j])
