This file has the code to obtain syllable transition values from Onset-Offset data, and process the file

Date: 20/9/2022

Problems:
1. Bouts are picked if gap > 2s and when there's a change in filename - should be modified if songbouts are cut up across files (BCC)
2. Rare/esoteric syllables have not been removed
3. 'End' -> 'Start' transition probability value is 1 in the output file. This needs to be changed when processing the Trans_prob file.

Major changes in ZF OnsetOffsetFiles - 
15 columns, instead of 13. So, ['Fundamental Frequency (Hz)', 'RMS Amplitude'] added at the end. Start and End row modified by adding two extra zeroes. Works for all other files as well.


In [26]:
import math
import pandas
import os
import numpy as np

In [37]:
# assign directory
directory = 'D:\\4th Year\\Semester 7\\BI4313 Sem Project\\IN-comparative-analysis\\IN-comparative-analysis\\test\\OnsetOffsetFiles'

onset_files = []

# iterate over files in
# that directory
for filename in os.scandir(directory):
    if filename.is_file():
        onset_files.append(filename.path)

In [28]:
#Directories to save Start_end and Trans_prob files
save_dir_se = 'D:\\4th Year\\Semester 7\\BI4313 Sem Project\\IN-comparative-analysis\\IN-comparative-analysis\\test\\Start_End\\'
save_dir_tp = 'D:\\4th Year\\Semester 7\\BI4313 Sem Project\\IN-comparative-analysis\\IN-comparative-analysis\\test\\Trans_prob\\'

labels = ['JF_red28blu13', 'JF_ylw25gry11', 'ZF_grn21org41', 'ZF_org11pnk5', 'ZF_pnk93pnk91', 'ZF_red15ylw15', 'ZF_ylw95ylw29']
sp_label = ['JF', 'JF', 'ZF', 'ZF', 'ZF', 'ZF', 'ZF']


In [29]:
#Creates Start-End files from OnsetOffset files
for file_idx in range(len(onset_files)):
   data = pandas.read_csv(onset_files[file_idx], sep = '\t', header=None, 
      names = ['FileName', 'Syll #', 'Syll Label',
      ' Syll Onset (ms)', ' Syll Offset (ms)', ' Syll Duration (sec)', ' Mean Frequency (Hz)',
      ' Entropy', 'Log Amplitude (dB)', 'Pitch Goodness', 'FrequencyModulation',
      'AmplitudeModulation', 'EntropyVariance', 'Fundamental Frequency (Hz)', 'RMS Amplitude'])
   #Drop the first row
   data = data.iloc[1:, :]
   #Convert onset, offset and syllable duration to float type
   data[' Syll Onset (ms)'] = data[' Syll Onset (ms)'].astype(float)
   data[' Syll Offset (ms)'] = data[' Syll Offset (ms)'].astype(float)
   data[' Syll Duration (sec)'] = data[' Syll Duration (sec)'].astype(float)
   
   #Add a 'start' label in the beginning of the dataframe
   start_row = [0, 0, 'Start', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
   data = pandas.DataFrame(np.insert(data.values, 0, start_row, axis=0))
   #Give column labels again
   data.columns = ['FileName', 'Syll #', 'Syll Label',
      ' Syll Onset (ms)', ' Syll Offset (ms)', ' Syll Duration (sec)', ' Mean Frequency (Hz)',
      ' Entropy', 'Log Amplitude (dB)', 'Pitch Goodness', 'FrequencyModulation',
      'AmplitudeModulation', 'EntropyVariance', 'Fundamental Frequency (Hz)', 'RMS Amplitude']
   
   #To add a 'start' and 'end' in syllable label column after end of every bout
   t_offset = data[' Syll Offset (ms)']
   new_row = [[0, 0, 'End', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 'Start', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
   
   #Create a list of indices where the labels should be inserted - gap > 2s or in between files
   index_list_bout = []
   for i in range(1, len(data)-1):
      diff = t_offset[i+1] - t_offset[i]
      if diff > 2000:
         index_list_bout.append(i)
   #print(index_list_bout)
   #Adjust the index to correct for where the row will be inserted
   for j in range(len(index_list_bout)):
      index_list_bout[j] += (2*j + 1)
   for k in index_list_bout:
      data = pandas.DataFrame(np.insert(data.values, k, new_row, axis=0))
   
   #Give column labels again
   data.columns = ['FileName', 'Syll #', 'Syll Label',
      ' Syll Onset (ms)', ' Syll Offset (ms)', ' Syll Duration (sec)', ' Mean Frequency (Hz)',
      ' Entropy', 'Log Amplitude (dB)', 'Pitch Goodness', 'FrequencyModulation',
      'AmplitudeModulation', 'EntropyVariance', 'Fundamental Frequency (Hz)', 'RMS Amplitude']
   
   syll_idx = data['Syll #']
   index_list_file = []
   for i in range(1, len(data)-1):
      if syll_idx[i] == '1':
         index_list_file.append(i)
   #print(len(index_list_file))
   #'end-start' rows need to be inserted one step before data from new file begins
   ones = np.ones(len(index_list_file))
   index_list_file = index_list_file - ones
   #delete first index - to ignore the first "new" file
   index_list_file = index_list_file[1:]
   index_list_file = index_list_file.astype(int)

   for l in range(len(index_list_file)):
      index_list_file[l] += (2*l + 1)
   #Insert the new_row at these indices
   #PS. If this doesn't make sense, remove the correction, do a simple FOR loop and see what happens
   for m in index_list_file:
      data = pandas.DataFrame(np.insert(data.values, m, new_row, axis=0))

   #To add an 'End' label at the end
   end_row = [[0, 0, 'End', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
   data = pandas.DataFrame(np.insert(data.values, len(data), end_row, axis=0))
   
   #Give column labels again
   data.columns = ['FileName', 'Syll #', 'Syll Label',
      ' Syll Onset (ms)', ' Syll Offset (ms)', ' Syll Duration (sec)', ' Mean Frequency (Hz)',
      ' Entropy', 'Log Amplitude (dB)', 'Pitch Goodness', 'FrequencyModulation',
      'AmplitudeModulation', 'EntropyVariance', 'Fundamental Frequency (Hz)', 'RMS Amplitude']
   
   fname_se = save_dir_se + labels[file_idx] + '_start_end.csv'
   data.to_csv(fname_se, header = False, index = False)

In [35]:
directory = 'D:\\4th Year\\Semester 7\\BI4313 Sem Project\\IN-comparative-analysis\\IN-comparative-analysis\\test\\Start_End'

start_end_files = []

# iterate over files in
# that directory
for filename in os.scandir(directory):
    if filename.is_file():
        start_end_files.append(filename.path)

In [33]:
#For creating trans_prob files

#labels = ['JF_red28blu13', 'JF_ylw25gry11']

for file_idx in range(len(start_end_files)):
   data = pandas.read_csv(start_end_files[file_idx], header = None, 
      names = ['FileName', 'Syll #', 'Syll Label',
      ' Syll Onset (ms)', ' Syll Offset (ms)', ' Syll Duration (sec)', ' Mean Frequency (Hz)',
      ' Entropy', 'Log Amplitude (dB)', 'Pitch Goodness', 'FrequencyModulation',
      'AmplitudeModulation', 'EntropyVariance', 'Fundamental Frequency (Hz)', 'RMS Amplitude'])
   syl = data['Syll Label']
   #print(syl)
   #Get the set of unique syllables for the individual with their own index (so convert array to list)
   uniq_syl = syl.unique()
   uniq_syl = uniq_syl.tolist()
   #print(uniq_syl)
   #Get the number of syllables to create a N x N matrix for transition
   N = len(uniq_syl)
   trans_matrix = np.array(np.zeros((N, N), dtype = int))
   #Convert 'object' type to array to get indices
   syl = np.array(syl)
   #This gives the number of syllable transitions
   for i in range(len(syl)-1):
      a = uniq_syl.index(syl[i])
      b = uniq_syl.index(syl[i+1])
      trans_matrix[a, b] += 1
   #Divide by total number of transitions
   trans_matrix = (trans_matrix.T/trans_matrix.sum(axis=1)).T
   trans_matrix = np.around(trans_matrix, 2)
   
   #To remove values less than 0.05 
   for i in range(len(trans_matrix)):
      for j in range(len(trans_matrix)):
         if trans_matrix[i, j] < 0.05:
            trans_matrix[i, j] = 0
   #Add the labels of syllables as an extra row and column
   syl_name = np.array(uniq_syl)
   trans_prob = np.concatenate([[syl_name], trans_matrix])

   #Add a '0' in the beginning to insert this as a column
   syl_name = np.concatenate([[0], syl_name])
   trans_prob = np.insert(trans_prob, 0, syl_name, axis = 1)

   transition_probability = pandas.DataFrame(trans_prob)
   fname_tp = save_dir_tp + labels[file_idx] + '_transition_probability.csv'
   transition_probability.to_csv(fname_tp, header = False, index = False)

#Remember: Delete End->Start = 1 value from the file
#This file contains the low-occuring syllables as well

In [38]:
#To get the occurrence of number of syllables
#And confirm that this is the same as row_sum of trans_matrix
for file_idx in range(len(onset_files)):
    data = pandas.read_csv(onset_files[file_idx], sep = '\t', header=None, 
      names = ['FileName', 'Syll #', 'Syll Label',
      ' Syll Onset (ms)', ' Syll Offset (ms)', ' Syll Duration (sec)', ' Mean Frequency (Hz)',
      ' Entropy', 'Log Amplitude (dB)', 'Pitch Goodness', 'FrequencyModulation',
      'AmplitudeModulation', 'EntropyVariance', 'Fundamental Frequency (Hz)', 'RMS Amplitude'])
    syl_n = data['Syll Label'].value_counts()
    print(onset_files[file_idx])
    print(syl_n)

D:\4th Year\Semester 7\BI4313 Sem Project\IN-comparative-analysis\IN-comparative-analysis\test\OnsetOffsetFiles\BCC_M01.txt
j             20
c             20
d             20
e             20
i             20
g             19
a             19
b             19
f             19
h             19
p             19
q             19
r             19
k             18
m             17
l             12
o              5
Syll Label     1
Name: Syll Label, dtype: int64
D:\4th Year\Semester 7\BI4313 Sem Project\IN-comparative-analysis\IN-comparative-analysis\test\OnsetOffsetFiles\BCC_M03.txt
i             123
k              80
m              44
o              38
p              23
g              23
h              19
n              19
f              17
a              17
b              17
c              17
e              17
d              16
q               2
r               2
s               1
Syll Label      1
Name: Syll Label, dtype: int64
D:\4th Year\Semester 7\BI4313 Sem Project\IN-comparative-ana