This code loops over all files and cuts each file up into song bouts, so that the bouts can be filtered based on different criteria.
09/11/2022

In [1]:
import math
import pandas
import os
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from functools import reduce

In [3]:
#Collect the positions of every item equal to `element` (e.g. 'End' in the syllable labels)
def find_end_indices(list, element):
    """Return 0 followed by the index of every item in ``list`` equal to ``element``.

    The leading 0 is always present, even when ``element`` never occurs, so
    the result can be consumed as consecutive (previous, next) boundary pairs.
    Matching indices are reported in ascending order.
    """
    # The parameter name shadows the builtin; it is kept unchanged so that
    # callers passing it by keyword continue to work.
    matches = [position for position, item in enumerate(list) if item == element]
    return [0] + matches

In [4]:
#Define a function to pick song bouts that are long in length
#Returns a list of data frames

def motif_finder(bouts_list):
    """Drop the bouts that are much shorter than the average bout.

    The length of a bout is the number of entries in its 'Syll Label'
    column. A bout is discarded when its length is below one third of the
    mean length over all bouts in ``bouts_list``.

    Parameters
    ----------
    bouts_list : list of pandas.DataFrame
        One data frame per bout; each must have a 'Syll Label' column.

    Returns
    -------
    list of pandas.DataFrame
        A new list with the surviving bouts in their original order.
        ``bouts_list`` itself is not modified.
    """
    # With no bouts there is nothing to average; return early instead of
    # letting np.mean([]) emit a RuntimeWarning and produce a NaN threshold.
    if not bouts_list:
        return []

    syll_counts = [len(bout['Syll Label']) for bout in bouts_list]
    min_syllables = np.mean(syll_counts) / 3

    # Build the filtered list directly rather than deleting from a copy,
    # which required re-adjusting every index after each deletion.
    return [
        bout
        for bout, count in zip(bouts_list, syll_counts)
        if count >= min_syllables
    ]

In [5]:
#Combines all the dataframes in a list

def concatenate_bout_list(bout_list):
    """Stack the bouts into one data frame with a synthetic 'Start' row on top.

    Parameters
    ----------
    bout_list : list of pandas.DataFrame
        Bouts to stack, in order. The hard-coded start row has 15 entries,
        so each bout is expected to have 15 columns.

    Returns
    -------
    pandas.DataFrame
        The start row followed by the rows of every bout. The frame is
        rebuilt from a NumPy array, so the original column names and index
        are replaced by default integer labels. For an empty ``bout_list``
        the result contains only the start row.
    """
    # Placeholder row: 'Start' in the third (syllable label) position, zeros elsewhere.
    start_row = [0, 0, 'Start', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

    if not bout_list:
        # pandas.concat rejects an empty list; return the start row alone.
        return pandas.DataFrame(np.array([start_row], dtype=object))

    # One concat call copies the data once. Growing a frame inside a loop
    # re-copies all accumulated rows on every iteration, and seeding it with
    # an empty DataFrame triggers a deprecation warning on recent pandas.
    combined = pandas.concat(bout_list)
    return pandas.DataFrame(np.insert(combined.values, 0, start_row, axis=0))

In [13]:
directory = 'D:\\4th Year\\Semester 7\\BI4313 Sem Project\\IN-comparative-analysis\\IN-comparative-analysis\\test\\Start_End'

# Collect the path of every regular file in that directory.
# os.scandir yields entries in arbitrary order, so the paths are sorted to
# give a deterministic list; code that pairs these paths with another list
# by position depends on a stable order. The `with` block closes the
# directory iterator as soon as the listing is complete.
with os.scandir(directory) as entries:
    onset_files = sorted(entry.path for entry in entries if entry.is_file())

In [16]:
onset_files

['D:\\4th Year\\Semester 7\\BI4313 Sem Project\\IN-comparative-analysis\\IN-comparative-analysis\\test\\Start_End\\BCC_M01_start_end.csv',
 'D:\\4th Year\\Semester 7\\BI4313 Sem Project\\IN-comparative-analysis\\IN-comparative-analysis\\test\\Start_End\\BCC_M03_start_end.csv',
 'D:\\4th Year\\Semester 7\\BI4313 Sem Project\\IN-comparative-analysis\\IN-comparative-analysis\\test\\Start_End\\JF_ylw22gry08_start_end.csv']

In [17]:
save_dir_sb = 'D:\\4th Year\\Semester 7\\BI4313 Sem Project\\IN-comparative-analysis\\IN-comparative-analysis\\test\\Bouts_Start_End\\'

labels = ['BCC_M01', 'BCC_M03', 'JF_ylw22gry08']

# Column layout of the header-less input files; reused for the output frame.
SYLLABLE_COLUMNS = ['FileName', 'Syll #', 'Syll Label',
    ' Syll Onset (ms)', ' Syll Offset (ms)', ' Syll Duration (sec)', ' Mean Frequency (Hz)',
    ' Entropy', 'Log Amplitude (dB)', 'Pitch Goodness', 'FrequencyModulation',
    'AmplitudeModulation', 'EntropyVariance', 'Fundamental Frequency (Hz)', 'RMS Amplitude']

# Labels are matched to input files purely by position. Check the pairing
# before anything is written, so a reordered or extra file cannot silently
# be saved under another individual's label.
if len(onset_files) > len(labels):
    raise ValueError(
        f"{len(onset_files)} input files but only {len(labels)} labels")
for onset_path, label in zip(onset_files, labels):
    if not os.path.basename(onset_path).startswith(label):
        raise ValueError(
            f"label {label!r} does not match input file {onset_path!r}")

# Make sure the output directory exists before the first to_csv call.
os.makedirs(save_dir_sb, exist_ok=True)

for onset_path, label in zip(onset_files, labels):
    data = pandas.read_csv(onset_path, header=None, names=SYLLABLE_COLUMNS)

    all_syl = data['Syll Label'].tolist()
    end_indices = find_end_indices(all_syl, 'End')

    # Each bout runs from the row after one 'End' up to and including the
    # next 'End'. Rows after the last 'End' are not part of any bout.
    # NOTE(review): end_indices begins with a placeholder 0, so the first
    # slice starts at row 1 and row 0 of the file is never included;
    # concatenate_bout_list later prepends a synthetic 'Start' row. Confirm
    # this is intended, in particular when the first bout is filtered out.
    song_bouts = [
        data.iloc[previous_end + 1 : next_end + 1, :]
        for previous_end, next_end in zip(end_indices, end_indices[1:])
    ]

    # Keep only the sufficiently long bouts, then stack them into one frame.
    song_motif = motif_finder(song_bouts)
    motif_bouts = concatenate_bout_list(song_motif)
    motif_bouts.columns = SYLLABLE_COLUMNS

    fname_sb = save_dir_sb + label + '_bouts_start_end.csv'
    motif_bouts.to_csv(fname_sb, header=False, index=False)


Calculate the syllable transition probabilities for the files that contain song motifs (song bouts with a sufficiently high number of syllables).

In [19]:
directory = 'D:\\4th Year\\Semester 7\\BI4313 Sem Project\\IN-comparative-analysis\\IN-comparative-analysis\\test\\Bouts_Start_End'

# Collect the path of every regular file in that directory.
# os.scandir yields entries in arbitrary order, so the paths are sorted to
# give a deterministic list; code that pairs these paths with another list
# by position depends on a stable order. The `with` block closes the
# directory iterator as soon as the listing is complete.
with os.scandir(directory) as entries:
    song_bout_files = sorted(entry.path for entry in entries if entry.is_file())


In [21]:
save_dir_tp = 'D:\\4th Year\\Semester 7\\BI4313 Sem Project\\IN-comparative-analysis\\IN-comparative-analysis\\test\\Trans_prob\\'

# Transition probabilities below this value are set to 0 before saving.
MIN_TRANSITION_PROB = 0.05

# Labels are matched to bout files purely by position. Check the pairing
# before anything is written, so a reordered or extra file cannot silently
# be saved under another individual's label.
if len(song_bout_files) > len(labels):
    raise ValueError(
        f"{len(song_bout_files)} bout files but only {len(labels)} labels")
for bout_path, label in zip(song_bout_files, labels):
    if not os.path.basename(bout_path).startswith(label):
        raise ValueError(
            f"label {label!r} does not match bout file {bout_path!r}")

# Make sure the output directory exists before the first to_csv call.
os.makedirs(save_dir_tp, exist_ok=True)

for bout_path, label in zip(song_bout_files, labels):
    data = pandas.read_csv(bout_path, header=None,
        names = ['FileName', 'Syll #', 'Syll Label',
        ' Syll Onset (ms)', ' Syll Offset (ms)', ' Syll Duration (sec)', ' Mean Frequency (Hz)',
        ' Entropy', 'Log Amplitude (dB)', 'Pitch Goodness', 'FrequencyModulation',
        'AmplitudeModulation', 'EntropyVariance', 'Fundamental Frequency (Hz)', 'RMS Amplitude'])

    syl = data['Syll Label']
    # Unique syllables in order of first appearance; the position in this
    # list is the syllable's row/column in the N x N transition matrix.
    uniq_syl = syl.unique().tolist()
    N = len(uniq_syl)
    syl_position = {name: position for position, name in enumerate(uniq_syl)}

    # Count every transition between consecutive rows of the file. A dict
    # lookup replaces a linear list search per syllable.
    codes = np.array([syl_position[name] for name in syl], dtype=int)
    trans_counts = np.zeros((N, N), dtype=int)
    # np.add.at accumulates repeated (from, to) pairs, unlike fancy-index +=.
    np.add.at(trans_counts, (codes[:-1], codes[1:]), 1)

    # Normalise each row by its total number of outgoing transitions.
    # A syllable with no outgoing transition (row total 0) keeps an all-zero
    # row instead of producing 0/0 = NaN and a RuntimeWarning.
    row_totals = trans_counts.sum(axis=1)
    safe_totals = np.where(row_totals == 0, 1, row_totals)
    trans_matrix = trans_counts / safe_totals[:, np.newaxis]
    trans_matrix = np.around(trans_matrix, 2)

    # Remove values below the threshold.
    trans_matrix[trans_matrix < MIN_TRANSITION_PROB] = 0

    # Add the syllable labels as an extra first row ...
    syl_name = np.array(uniq_syl)
    trans_prob = np.concatenate([[syl_name], trans_matrix])

    # ... and as an extra first column; the leading 0 fills the corner cell.
    syl_name = np.concatenate([[0], syl_name])
    trans_prob = np.insert(trans_prob, 0, syl_name, axis = 1)

    transition_probability = pandas.DataFrame(trans_prob)
    fname_tp = save_dir_tp + label + '_bout_trans_prob.csv'
    transition_probability.to_csv(fname_tp, header = False, index = False)