This file contains the code that reads syllable Onset-Offset data, marks the start of each bout, and computes syllable transition probabilities, which are saved to a .csv file

Date: 20/9/2022

In [55]:
import math
import pandas
import os
import numpy as np

In [56]:
#Set the working directory; the relative file names used for reading and writing in this file resolve against it
#NOTE: this is an absolute path on one specific machine and has to be edited before running elsewhere
os.chdir('D:\\4th Year\\Semester 7\\BI4313 Sem Project\\IN-comparative-analysis\\IN-comparative-analysis\\OnsetOffsetFiles')

In [57]:
#Import the tab-separated text file. The header is not taken from the file (header=None); the column
#names are given manually instead, because the names in the file don't seem to align with the columns
#Use 'print(data.columns)' to get the list of column names
column_names = ['FileName', 'Syll #', 'Syll Label',
       ' Syll Onset (ms)', ' Syll Offset (ms)', ' Syll Duration (sec)', ' Mean Frequency (Hz)',
       ' Entropy', 'Log Amplitude (dB)', 'Pitch Goodness', 'FrequencyModulation',
       'AmplitudeModulation', 'EntropyVariance']
data = pandas.read_csv('M09.txt', sep = '\t', header=None, names = column_names)

#Drop the first row because the column names are repeated there
#.copy() makes 'data' an independent frame, so the column assignments below do not write into a
#slice of the frame returned by read_csv (which is what triggers pandas' SettingWithCopyWarning)
#The index labels are deliberately left as they are (they start at 1 after the drop)
data = data.iloc[1:, :].copy()

#Convert onset, offset and syllable duration to float type
#(they are read as text because the repeated header row was part of the data)
for column in (' Syll Onset (ms)', ' Syll Offset (ms)', ' Syll Duration (sec)'):
    data[column] = data[column].astype(float)
#data[' Syll Offset (ms)']

In [88]:

#t_offset = data[' Syll Offset (ms)']
#new_row = [0, 0, 'Start', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

#for i in range(1, len(data)-1):
#    diff = t_offset[i+1] - t_offset[i]
#    if diff > 2000:
#        data = pandas.DataFrame(np.insert(data.values, i, new_row, axis=0))

#Problem - this code is not updating index number as it adds a 'start', so from the 2nd break onwards,
#the 'start' label is misplaced by one row. To correct this, I'll create a list of indices and insert
#the 'start' row using this list

In [58]:
#Insert a 'Start' row in the syllable label column wherever a new bout begins
#A new bout is taken to begin when the values in ' Syll Offset (ms)' of two consecutive rows
#differ by more than BOUT_GAP_MS

BOUT_GAP_MS = 2000
t_offset = data[' Syll Offset (ms)']
new_row = [0, 0, 'Start', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

#Compare every consecutive pair of rows by position (including the very last pair), so the result
#does not depend on how the index of 'data' happens to be labelled
#offset_gaps[k] is (row k+1) minus (row k), hence the +1: the 'Start' row has to go immediately
#before the later row of the pair
offset_gaps = np.diff(np.asarray(t_offset, dtype=float))
index_list = (np.flatnonzero(offset_gaps > BOUT_GAP_MS) + 1).tolist()

#Insert from the bottom up: rows above an insertion point do not move, so the positions that are
#still to be processed stay valid and need no correction for the rows already inserted
#If there is no bout break at all, 'data' is left untouched
if index_list:
    original_columns = data.columns
    rows = data.values
    for position in reversed(index_list):
        rows = np.insert(rows, position, new_row, axis=0)
    #Rebuild the DataFrame once, keeping the column names
    data = pandas.DataFrame(rows, columns=original_columns)

#To confirm
#print(data)
#data.to_csv('M09_tp.csv', header = False, index = False)

In [60]:
#(Re)assign the column names so that the columns can be addressed by name from here on
column_names = [
    'FileName', 'Syll #', 'Syll Label',
    ' Syll Onset (ms)', ' Syll Offset (ms)', ' Syll Duration (sec)',
    ' Mean Frequency (Hz)', ' Entropy', 'Log Amplitude (dB)',
    'Pitch Goodness', 'FrequencyModulation', 'AmplitudeModulation',
    'EntropyVariance',
]
data.columns = column_names

#data

In [61]:
#Count how many times each syllable label occurs
#These counts can be compared with the row sums of the transition-count matrix (trans_matrix)
#NOTE: a row sum counts the transitions that leave a label, so for the label of the very last row
#the row sum is one less than its count here
syl_n = data.loc[:, 'Syll Label'].value_counts()

#syl_n

In [62]:
#Syllable label of every row, in row order
syl = data['Syll Label']
#print(syl)

#Unique labels, converted from an array to a list so that each label has a position
#that can be looked up with uniq_syl.index(...)
uniq_syl = syl.unique().tolist()
#print(uniq_syl)

#The transition matrix is N x N: one row and one column per unique label, all counts starting at 0
N = len(uniq_syl)
trans_matrix = np.zeros((N, N), dtype = int)

#uniq_syl.index('a')


In [63]:

#Turn the label column into a plain array so it can be walked by position
syl = np.array(syl)

#Row/column number of every unique label in trans_matrix
label_position = {label: k for k, label in enumerate(uniq_syl)}

#Walk over each pair of consecutive labels and add one to the cell
#[row of the current label, column of the label that follows it]
for current_label, next_label in zip(syl[:-1], syl[1:]):
    trans_matrix[label_position[current_label], label_position[next_label]] += 1

#trans_matrix

In [64]:
#Turn the transition counts into transition probabilities: every row is divided by its own sum,
#i.e. by the number of transitions that leave that syllable
row_totals = trans_matrix.sum(axis=1, keepdims=True)

#A syllable that is never followed by anything (one that occurs only as the very last entry) has
#a row sum of 0. Its row is left as zeros instead of dividing 0 by 0, which would put NaN in the
#matrix and raise a RuntimeWarning
trans_matrix = np.divide(trans_matrix, row_totals,
                         out=np.zeros(trans_matrix.shape, dtype=float),
                         where=row_totals != 0)

#Keep two decimal places
trans_matrix = np.around(trans_matrix, 2)

#print(np.sum(trans_matrix[0,]))
#print(trans_matrix)

In [65]:
#Put the syllable labels on top of the probability matrix as a header row
#(the labels are added as the first column in a later step)
#Suggestion kept from the original notes: relabelling 'start' as 'end' in this header would stop
#notes from connecting back to 'start'; the code below does NOT do this relabelling
syl_name = np.array(uniq_syl)

header_row = syl_name[np.newaxis, :]
trans_prob = np.concatenate((header_row, trans_matrix), axis=0)
#trans_prob

In [66]:
#Put a 0 in front of the labels (it fills the top-left corner cell) so that they can be
#inserted as the first column of the table
corner = [0]
syl_name = np.concatenate((corner, syl_name))

trans_prob = np.insert(arr=trans_prob, obj=0, values=syl_name, axis=1)
#trans_prob

#Convert to a dataframe and save as a .csv file; the labels are already part of the table,
#so pandas' own header and index are not written
transition_probability = pandas.DataFrame(trans_prob)
transition_probability.to_csv('M09_trans_prob.csv', index = False, header = False)

#There are values less than 0.05 that could be removed with an if condition
#But we can keep the values and code the diagram so that it does not show them
#This file contains the low-occurring syllables as well