This file contains the code to compute syllable transition probabilities from Onset-Offset data and to save the resulting transition matrix to a .csv file

Date: 20/9/2022

Problems:
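1. A bout boundary is placed wherever the difference between consecutive syllable offsets is > 2 s or < -50 s (thresholds based on BCC songs; the code below currently uses -3 s). These thresholds need to be adjusted for each individual, and the approach won't work if a song is split across files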
2. Rare/esoteric syllables have not been removed
3. 'End' -> 'Start' transition probability value is 1 in the output file. This needs to be changed

In [1]:
import math
import pandas
import os
import numpy as np

In [2]:
#Working directory: the relative file names used below are resolved against this folder
onset_offset_dir = 'D:\\4th Year\\Semester 7\\BI4313 Sem Project\\IN-comparative-analysis\\IN-comparative-analysis\\OnsetOffsetFiles'
os.chdir(onset_offset_dir)

In [214]:
#Column names of the Onset-Offset file. They are given manually (header=None below)
#because the names in the file don't seem to align with the data columns
#Use 'print(data.columns)' to get the list of column names
column_names = ['FileName', 'Syll #', 'Syll Label',
       ' Syll Onset (ms)', ' Syll Offset (ms)', ' Syll Duration (sec)', ' Mean Frequency (Hz)',
       ' Entropy', 'Log Amplitude (dB)', 'Pitch Goodness', 'FrequencyModulation',
       'AmplitudeModulation', 'EntropyVariance', 'Fundamental Frequency (Hz)', 'RMS Amplitude']

#Import the tab-separated text file as csv
data = pandas.read_csv('ZF_ylw95ylw29.txt', sep = '\t', header=None, names = column_names)

#Drop the first row because column names are repeated there
#.copy() gives an independent dataframe, so the column assignments below
#do not trigger pandas' SettingWithCopyWarning
data = data.iloc[1:, :].copy()

#Convert onset, offset and syllable duration to float type
for column in (' Syll Onset (ms)', ' Syll Offset (ms)', ' Syll Duration (sec)'):
    data[column] = data[column].astype(float)

#Add a 'start' label in the beginning of the dataframe
#The row is all zeroes except for the label, with one entry per column
start_row = [0, 0, 'Start'] + [0] * (len(column_names) - 3)

#np.insert works on the raw values and drops the column labels, so they are given again here
data = pandas.DataFrame(np.insert(data.values, 0, start_row, axis=0), columns = column_names)

Major changes for ZF OnsetOffsetFiles:
The files have 15 columns instead of 13, so ['Fundamental Frequency (Hz)', 'RMS Amplitude'] were added at the end of the column list, and the 'Start' and 'End' rows were extended with two extra zeroes.


In [215]:

#t_offset = data[' Syll Offset (ms)']
#new_row = [0, 0, 'Start', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

#for i in range(1, len(data)-1):
#    diff = t_offset[i+1] - t_offset[i]
#    if diff > 2000:
#        data = pandas.DataFrame(np.insert(data.values, i, new_row, axis=0))

#Problem - this code is not updating index number as it adds a 'start', so from the 2nd break onwards,
#the 'start' label is misplaced by one row. To correct this, I'll create a list of indices and insert
#the 'start' row using this list

In [216]:
#To add a 'start' and 'end' in syllable label column after end of every bout

#Thresholds (in ms) on the difference between the offsets of consecutive syllables
#If the condition was just 'diff > MAX_GAP_MS', the songs that start in the next file are not picked up
#So, a large negative difference (moving from one file to another) also counts as a break
#Problem: This wouldn't work for songs that are split up across files
#Use FILE_RESET_MS = -50000 for BCC, because most of their songs are contained within files.
MAX_GAP_MS = 2000
FILE_RESET_MS = -3000

t_offset = data[' Syll Offset (ms)']
new_row = [[0, 0, 'End', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 'Start', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
end_row = [[0, 0, 'End', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

#Remember the column labels, because rebuilding the dataframe from raw values drops them
bout_columns = list(data.columns)

#diffs[i] = t_offset[i+1] - t_offset[i]
diffs = np.diff(t_offset.to_numpy(dtype=float))
is_break = (diffs > MAX_GAP_MS) | (diffs < FILE_RESET_MS)
#Row 0 is the initial 'Start' row, not a syllable, so it can never close a bout
is_break[:1] = False

#index_list holds the row number of the last syllable of every bout that is followed by another bout
index_list = np.flatnonzero(is_break).tolist()
#print(index_list)

#Insert an 'End' + 'Start' pair before the first row of each following bout (row i+1)
#All pairs go in with a single np.insert call: the indices refer to the rows as they are now,
#so no correction for previously inserted rows is needed
values = data.values
if index_list:
    insert_before = np.repeat(np.array(index_list) + 1, 2)
    #dtype=object keeps the zeroes as numbers instead of turning the whole row into strings
    boundary_rows = np.tile(np.array(new_row, dtype=object), (len(index_list), 1))
    values = np.insert(values, insert_before, boundary_rows, axis=0)

#To add an 'End' label at the end
values = np.insert(values, len(values), end_row, axis=0)

#Back to a dataframe, with the column labels given again
data = pandas.DataFrame(values, columns = bout_columns)

#To confirm
#print(data)
#data.to_csv('BCC_M21_trans_prob_start_end.csv', header = False, index = False)

In [217]:
#Count how many times each label in the 'Syll Label' column occurs
#These counts are meant to be checked against the row sums of trans_matrix
label_column = data['Syll Label']
syl_n = label_column.value_counts()

syl_n

i        865
a        328
b        325
c        310
d        263
j        117
Start    105
End      105
L         79
k         38
s         15
p         10
Z          6
n          2
S          1
x          1
y          1
K          1
Name: Syll Label, dtype: int64

In [218]:
#Label column of the dataframe, one entry per row
syl = data['Syll Label']
#print(syl)

#Unique labels as a plain list, so that the position of a label in the list
#can be used as its row/column number in the transition matrix
uniq_syl = syl.unique().tolist()
#print(uniq_syl)

#N x N matrix of integer zeroes, to be filled with the transition counts
N = len(uniq_syl)
trans_matrix = np.zeros((N, N), dtype = int)

#uniq_syl.index('a')


In [219]:

#Convert 'object' type to array to get indices
syl = np.array(syl)

#Look-up table from label to its row/column number in trans_matrix
#Built once, so that uniq_syl does not have to be searched for every transition
syl_index = {label: position for position, label in enumerate(uniq_syl)}

#This gives the number of syllable transitions:
#row = current label, column = the label that follows it
#Every consecutive pair of rows is counted, so an 'End' directly followed by a 'Start' is counted too
for current, following in zip(syl[:-1], syl[1:]):
    trans_matrix[syl_index[current], syl_index[following]] += 1

#trans_matrix

In [220]:
#Divide the number of transitions by total number of occurrence of that syllable to get transition probability
#A row without any outgoing transition (e.g. the final 'End' when the file holds a single bout)
#has a total of 0; it is left as zeroes instead of becoming NaN through 0/0
row_totals = trans_matrix.sum(axis=1, keepdims=True)
trans_matrix = np.divide(trans_matrix, row_totals,
                         out=np.zeros(trans_matrix.shape, dtype=float),
                         where=row_totals > 0)

#Round the probabilities to 2 decimal places
trans_matrix = np.around(trans_matrix, 2)

#print(np.sum(trans_matrix[0,]))
#print(trans_matrix)

In [221]:
#Set every transition probability below 0.05 to zero
#Skip this cell if you want to keep all the values

#SKIP FOR BENGALESE FINCHES
#For Bengalese finches, the syllables repeat so many times in a song that trans_prob to 'End' is always < 0.05

#Boolean mask picks the small entries; the assignment changes trans_matrix in place
trans_matrix[trans_matrix < 0.05] = 0

In [222]:
#Put the syllable labels on top of the matrix as an extra first row
#(the matching first column is added separately)
#Note: it's better to replace 'start' here with 'end' so that notes don't connect back to 'start'
syl_name = np.array(uniq_syl)

#Turn the labels into a single-row 2-D array so they can be stacked above trans_matrix
header_row = syl_name[np.newaxis, :]
trans_prob = np.concatenate([header_row, trans_matrix])
#trans_prob

In [223]:
#Add a '0' in the beginning to insert this as a column
#trans_prob already has the label row on top, so the column needs one extra entry for the corner cell
syl_name = np.concatenate([[0], syl_name])

#Insert the labels as the first column (index 0 along axis 1)
trans_prob = np.insert(trans_prob, 0, syl_name, axis = 1)
#trans_prob

#Convert to dataframe to save as .csv file - make sure to change the name!
#header and index are switched off, so only the contents of trans_prob are written
transition_probability = pandas.DataFrame(trans_prob)
transition_probability.to_csv('ZF_ylw95ylw29_transition_probability.csv', header = False, index = False)

#Remember: Delete End->Start = 1 value from the file
#This file contains the low-occurring syllables as well