# duration data analysis

TODO:

- DONE walk the directory with textgrids
- DONE extract textgrids
- DONE collect all instances of vowels with contexts and speaker names
- find and correct spelling errors in words
- split palatalized and non-palatalized contexts

In [None]:
import textgrid
import numpy as np
import pandas as pd
import re
import os, sys
from time import strftime, localtime

import gspread
from oauth2client.service_account import ServiceAccountCredentials

import seaborn as sns
import warnings
warnings.filterwarnings("ignore")


## reading files

In [None]:
# Data locations, given relative to the notebook's working directory.
# Only FORMANT_PATH is read by the cells visible in this notebook.
ROOT_PATH = '../misc/'
AUDIO_TEXTGRID_PATH = '../misc/audio/' # wav + TextGrid
FORMANT_PATH = '../misc/formants/' # csv formant data

In [None]:
# Rename files so that their names contain underscores instead of spaces.
# NOTE(review): this walks './misc/audio/' while AUDIO_TEXTGRID_PATH points to
# '../misc/audio/' -- confirm which of the two is the intended location.

for path, subdirs, files in os.walk('./misc/audio/'):
    for name in files:
        # Only the file name is rewritten.  Replacing spaces in the whole path
        # would send the file into a directory that does not exist whenever a
        # parent folder has a space in its name.
        new_name = name.replace(' ', '_')
        if new_name == name:
            continue  # nothing to rename
        os.rename(os.path.join(path, name), os.path.join(path, new_name))

In [None]:
! rm -r ./misc/formants/.DS_Store

In [None]:
# Load the formant table (tab-separated) produced on 2024-07-23.

# Alternative kept for reference: concatenate every csv found in FORMANT_PATH.
# df = pd.concat([
#     pd.read_csv(os.path.join(FORMANT_PATH, fn)) for fn in os.listdir(FORMANT_PATH)
# ]).dropna(subset='phoneme').reset_index(drop=True)

formant_csv = os.path.join(FORMANT_PATH, 'output_20240723.csv')
df = pd.read_csv(formant_csv, delimiter='\t')


def _mean_of_track(raw):
    # Strip the first two and the last character of the cell, then average
    # the comma-separated numbers that remain.
    samples = [float(val) for val in raw[2:-1].split(',')]
    return np.array(samples).mean()


# One mean per formant track, stored under the upper-cased column name.
for col in ['f0', 'f1', 'f2', 'f3']:
    df[col.upper()] = df[col].apply(_mean_of_track)

kept_columns = [
    'filename', 'word', 'segment', 'vowelIntervalNum', 'wordIntervalNum',
    'duration', 'f0max', 'f0min', 'intensity', 'intensity_max',
    'F0', 'F1', 'F2', 'F3',
]
df = df[kept_columns]
df.head(2)

In [None]:
# Keep only the measurements that carry a word label.
# `subset` is passed as a list (a bare string is only accepted by newer pandas
# releases) and the result is copied so that the later `mrg_df['word'] = ...`
# assignment works on an independent frame.
mrg_df = df.dropna(subset=['word']).copy()

In [None]:
# this is how many contexts we'll have to resolve:
# the number of distinct (word, segment) pairs that occur more than once
(mrg_df[['word', 'segment']].value_counts() > 1).sum()

In [None]:
# Split multi-word labels into one row per word, then give every
# vowel-initial word a glottal-stop onset.
_INITIAL_VOWELS = 'aeoiuæăĕŏĭŭ°æy'


def _tokenize(label):
    return label.split()


def _add_glottal_onset(token):
    if token[0] in _INITIAL_VOWELS:
        return 'ʔ' + token
    return token


mrg_df['word'] = mrg_df['word'].apply(_tokenize)
mrg_df = mrg_df.explode('word').reset_index(drop=True)
mrg_df['word'] = mrg_df['word'].apply(_add_glottal_onset)

In [None]:
# (rows, columns) of the frame after the one-row-per-word split
mrg_df.shape
# mrg_df.head()

In [None]:
# fill NaN's automatically when possible

consonants = 'mḿnńŋŋ́pṕtčkʔʰsšxx́λlwẃjd' + 'bcfghqrv'
vowels = 'aeoiuæăĕŏĭŭ°æ' + 'y'
vowels_short = 'ăĕŏĭŭ°æ̆'
vowels_long = 'aeoiuæ'

def parse_syllables(word):
    if word[0] in vowels:
        word = 'ʔ' + word # no vowel-initial words
    cv_mask = ''
    one2one_parse = ''
    # create a CV-mask
    for seg in word:
        if seg in '́ ʹ':
            continue
        elif seg in consonants:
            cv_mask += 'C'
        elif seg in vowels_short:
            cv_mask += 'v'
        elif seg in vowels_long:
            cv_mask += 'V'
        one2one_parse += seg
        
    # split into syllables
    nucleus = False
    coda = False
    onset = False
    syllables = list()
    syll, seg_syll = '', ''
    for slot, seg in zip(cv_mask[::-1], one2one_parse[::-1]):
        syll += slot.replace('V', 'VV').replace('v', 'V')
        seg_syll += seg
        if slot == 'C':
            if nucleus:
                syllables.append((syll[::-1], seg_syll[::-1]))
                syll, seg_syll = '', ''
                nucleus, coda, onset = False, True, False
        else:
            nucleus = True
    return syllables[::-1]
                
    
# Smoke test: print the syllabification of a few sample words,
# one result list per word, separated by blank lines.
words = ['ańa', 'xălakuhkon°tă', 'kăpčaḿṕoš°tu', 'tŭ', 'tol°', 'paŋk', 'taŋksa', 'pĭt kaλ´a'] 
print(*[parse_syllables(word) for word in words], sep='\n\n')

In [None]:
def determine_context(syllables, vowel):
    """
    Describe the context of the first syllable that contains `vowel`.

    syllables: list of (cv_mask, segments) tuples
    vowel: the vowel to look for (string)

    Returns a 6-tuple
    (syllable structure, syllable count, position, stress, vowel, pre-schwa):
      - syllable structure: cv_mask of the matching syllable
      - syllable count: 'monosyllable' or 'polysyllabic'
      - position: 'initial', 'medial' or 'final' (monosyllables are 'final')
      - stress: 'stressed' for even-numbered syllables (counting from 0) that
        are not word-final, and for monosyllables; otherwise 'unstressed'
      - vowel: 'low', 'high' or 'mid'; any other vowel is returned unchanged
      - pre-schwa: 'yes' when the following syllable contains '°', else 'no'

    A tuple of six empty strings is returned when the vowel is not found,
    including when `vowel` is empty or not a string (e.g. NaN).
    """
    not_found = ('', '', '', '', '', '')
    # An empty string is a substring of everything and a float cannot be
    # searched for at all, so neither can identify a syllable.
    if not isinstance(vowel, str) or not vowel:
        return not_found

    target_idx = next(
        (idx for idx, (_, seg_syll) in enumerate(syllables) if vowel in seg_syll),
        None,
    )
    if target_idx is None:
        return not_found

    syllable_structure = syllables[target_idx][0]
    is_monosyllable = len(syllables) == 1
    syllable_count = 'monosyllable' if is_monosyllable else 'polysyllabic'

    if is_monosyllable or target_idx == len(syllables) - 1:
        position = 'final'
    elif target_idx == 0:
        position = 'initial'
    else:
        position = 'medial'

    if target_idx % 2 == 0 and (position != 'final' or is_monosyllable):
        stress = 'stressed'
    else:
        stress = 'unstressed'

    if vowel in 'aă':
        vowel = 'low'
    elif vowel in 'uŭiĭ':
        vowel = 'high'
    elif vowel in 'eĕoŏææ̆':
        vowel = 'mid'

    # A non-final syllable always has a successor to inspect.
    if position != 'final' and '°' in syllables[target_idx + 1][1]:
        pre_schwa = 'yes'
    else:
        pre_schwa = 'no'

    return syllable_structure, syllable_count, position, stress, vowel, pre_schwa

In [None]:
def determine_contexts(syllables, vowels, indices):
    """
    Assign every measured vowel of a word to a syllable and describe its context.

    syllables: list of (cv_mask, segments) tuples
    vowels: list of vowel strings measured in the word
    indices: list of row identifiers, parallel to `vowels`

    Returns a list of [vowel, index, context] entries, where context is the
    6-tuple (syllable structure, syllable count, position, stress, vowel
    class, pre-schwa).

    The syllables are scanned from first to last; each syllable takes the
    earliest still-unassigned vowel that occurs in it.  The scan is repeated
    until every vowel is assigned, so several measurements of one vowel can map
    to the same syllable.  Vowels that occur in no syllable (and empty or
    non-string values) are left out of the result.

    The input lists are not modified.
    """
    def get_context(vowel, target_idx):
        syllable_structure = syllables[target_idx][0]
        is_monosyllable = len(syllables) == 1
        syllable_count = 'monosyllable' if is_monosyllable else 'polysyllabic'

        if is_monosyllable or target_idx == len(syllables) - 1:
            position = 'final'
        elif target_idx == 0:
            position = 'initial'
        else:
            position = 'medial'

        if target_idx % 2 == 0 and (position != 'final' or is_monosyllable):
            stress = 'stressed'
        else:
            stress = 'unstressed'

        if vowel in 'aă':
            vowel = 'low'
        elif vowel in 'uŭiĭ':
            vowel = 'high'
        elif vowel in 'eĕoŏææ̆':
            vowel = 'mid'

        if position != 'final' and '°' in syllables[target_idx + 1][1]:
            pre_schwa = 'yes'
        else:
            pre_schwa = 'no'

        return syllable_structure, syllable_count, position, stress, vowel, pre_schwa

    def occurs_in_word(vowel):
        return (
            isinstance(vowel, str)
            and vowel != ''
            and any(vowel in seg_syll for _, seg_syll in syllables)
        )

    # Work on a private list of (vowel, index) pairs.  Only vowels that can be
    # matched are kept, which guarantees that every scan below assigns at least
    # one of them and the loop terminates (also for an empty syllable list).
    pending = [
        (vowel, index)
        for vowel, index in zip(vowels, indices)
        if occurs_in_word(vowel)
    ]

    out = list()
    while pending:
        for target_idx, (_, seg_syll) in enumerate(syllables):
            for pos, (vowel, index) in enumerate(pending):
                if vowel in seg_syll:
                    out.append([vowel, index, get_context(vowel, target_idx)])
                    del pending[pos]
                    break

    return out

In [None]:
# Demo of determine_contexts on a few sample words.
# The sample lists have their own names so that the `vowels` string defined
# next to parse_syllables is not overwritten by a list of lists.
demo_words = ['xălakuhkon°tă', 'kăpčaḿṕoš°tu', 'tŭ', 'tol°', 'paŋk', 'aŋksa']
demo_vowels = [['ă', 'a', 'u', 'o', 'ă'], ['o', 'a'], ['ŭ', 'ŭ', 'ŭ'], ['o'], ['e'], ['a', 'a'],]
demo_indices = [[1, 2, 3, 4, 5], [1, 2], [2, 2, 2], [1], [2], [444, 44]]

for w, v, i in zip(demo_words, demo_vowels, demo_indices):
    demo_syllables = parse_syllables(w)
    print(demo_syllables)
    print(*determine_contexts(demo_syllables, v, i), sep='\n\n')
    print('================')

In [None]:
# Annotate every row of mrg_df with the context of its vowel.
# determine_context looks at the FIRST syllable containing the vowel, so
# repeated vowels within one word all receive the same context here.
mrg_df_filled = mrg_df.copy()
for idx, row in mrg_df.iterrows():
    syllable_structure, syllable_count, position, stress, vowel, pre_schwa = determine_context(
        parse_syllables(
            row.word
        ), row.segment
    )
#     mrg_df_filled.loc[idx, ['word', 'segment']] =
    # Writes seven columns at once; 'nenets' receives the word itself.
    mrg_df_filled.loc[idx, ['nenets', 
                            'syllable structure', 
                            'syllable count', 
                            'position', 
                            'stress', 
                            'vowel',
                            'pre-schwa']
                     ] = row.word, syllable_structure, syllable_count, position, stress, vowel, pre_schwa
# The consultant code is the part of the filename after the last underscore.
mrg_df_filled['consultant'] = mrg_df_filled['filename'].apply(lambda x: x.split('_')[-1])

In [None]:
# Collect, per (filename, word), the list of measured segments and the list of
# row labels they came from, then resolve all contexts of a word in one call.
grouping_keys = ['filename', 'word']

segments_per_word = mrg_df_filled.groupby(grouping_keys)['segment'].apply(list)
mrg_df_grouped = pd.DataFrame(segments_per_word).reset_index()

rows_per_word = mrg_df_filled.reset_index().groupby(grouping_keys)['index'].apply(list)
mrg_df_grouped['indeces'] = rows_per_word.values

# 'vowelIntervalNum', 'wordIntervalNum',

res = [
    determine_contexts(parse_syllables(row.word), row.segment, row.indeces)
    for _, row in mrg_df_grouped.iterrows()
]

In [None]:
# display the grouped frame (one row per filename/word pair)
mrg_df_grouped

In [None]:
# Attach the per-word results and unfold them to one row per resolved vowel.
mrg_df_grouped['out'] = res
mrg_df_grouped_expl = mrg_df_grouped.explode(['out'])
# Each 'out' entry is a [vowel, index, context] list; split it into three
# columns, overwriting the list-valued 'segment' and 'indeces' columns.
mrg_df_grouped_expl[['segment', 'indeces', 'context',]] = mrg_df_grouped_expl['out'].apply(pd.Series).values
# 'context' is the 6-tuple returned by determine_contexts; one column per item.
mrg_df_grouped_expl[['syllable structure', 
                    'syllable count', 
                    'position', 
                    'stress', 
                    'vowel',
                    'pre-schwa']] = mrg_df_grouped_expl['context'].apply(pd.Series).values
mrg_df_grouped_expl = mrg_df_grouped_expl.drop(columns=['out', 'context'])
# The consultant code is the part of the filename after the last underscore.
mrg_df_grouped_expl['consultant'] = mrg_df_grouped_expl['filename'].apply(lambda x: x.split('_')[-1])
# mrg_df_grouped_expl.indeces.value_counts()
# mrg_df_grouped_expl = mrg_df_grouped_expl.drop(['vowelIntervalNum', 'wordIntervalNum']).reset_index(drop=True)

In [None]:
# spot check: all resolved rows for the word 'kata'
mrg_df_grouped_expl.loc[mrg_df_grouped_expl.word == 'kata']

In [None]:
# Bring the acoustic measurements back in: 'indeces' holds the row label of
# the measurement in mrg_df.  Columns present on both sides are taken from the
# left frame; the right-hand duplicates are tagged and removed.
merged = mrg_df_grouped_expl.merge(
    mrg_df,
    left_on='indeces',
    right_index=True,
    suffixes=('', '_DROP'),
)
duplicate_columns = [column for column in merged.columns if '_DROP' in column]
mrg_df_filled = merged.drop(columns=duplicate_columns)

In [None]:
# this is how many contexts we'll have to add by hand
# now that's better
# (counts the rows whose 'word' is an empty string)
(mrg_df_filled['word'] == '').sum()

In [None]:
# frequency of every syllable structure among the resolved vowels
mrg_df_filled['syllable structure'].value_counts()

In [None]:
# Discard rows whose syllable structure is treated as a parsing error.

parse_error_shapes = ['CVCC', 'CVVCCC']
is_parse_error = mrg_df_filled['syllable structure'].isin(parse_error_shapes)
mrg_df_filled = mrg_df_filled.loc[~is_parse_error]

## let's draw

In [None]:
# look at one random row
mrg_df_filled.sample()

In [None]:
# size of the full table vs. size after keeping only rows with a vowel value
# and one row per (word, segment, filename)
print(mrg_df_filled.shape)
mrg_df_filled.loc[mrg_df_filled.vowel != ''].drop_duplicates(subset=['word', 'segment', 'filename']).shape

In [None]:
# Plotting subset: monosyllables with a vowel value, one row per
# (word, segment, filename), with numeric formant columns.
has_vowel = mrg_df_filled.vowel != ''
is_monosyllable = mrg_df_filled['syllable count'] == 'monosyllable'

data = mrg_df_filled.loc[has_vowel & is_monosyllable]
data = data.drop_duplicates(subset=['word', 'segment', 'filename'])

for formant in ('F1', 'F2'):
    data[formant] = data[formant].astype(float)

In [None]:
# Optional filters, currently switched off:
#   duration outliers
#     data = data.loc[(data.duration > data.duration.quantile(.1)) & (data.duration < data.duration.quantile(.9))]
#   a single vowel class
#     data = data.loc[data.vowel == 'mid']
#   leave out tsAYuU
#     data = data.loc[(~data.file.str.contains('_tsAYuU')) & (data.position == 'initial')]

# Active filter: consultant AOK only, without the segment 'e'.
is_not_e = ~data.segment.isin(['e'])
is_aok = data.consultant == 'AOK'
data = data.loc[is_not_e & is_aok]

data.sample()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np

# Plain scatter of the selected tokens in the F2/F1 plane.
x_name, y_name = 'F2', 'F1'

fig, ax = plt.subplots(figsize=(10,8))

x, y = data[x_name], data[y_name]

ax.scatter(x, y)
plt.show()

In [None]:
# Vowel chart: every token is drawn as its vowel symbol at (F2, F1), with both
# axes inverted and the ticks/labels moved to the top and right.
from itertools import cycle

# cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap performs the same
# lookup by name.
cmap = plt.get_cmap('Dark2')

fig, ax = plt.subplots(figsize=(10,8))

x_name = 'F2'
y_name = 'F1'

x = data[x_name]
y = data[y_name]

# Marker-less scatter -- presumably only there so that the axes are scaled to
# the data before the labels are placed.
ax.scatter(x, y,marker="")

# The palette is cycled so that segments beyond the number of palette colours
# are still drawn instead of being silently dropped by zip().
for v, color in zip(data.segment.unique(), cycle(cmap.colors)):
    X = data[x_name].loc[data.segment == v]
    Y = data[y_name].loc[data.segment == v]
    for px, py in zip(X, Y):
        ax.annotate(v,(px,py),fontsize=14,color=color)

ax.invert_xaxis()
ax.invert_yaxis()
ax.set_xlabel(x_name,fontsize=16)
ax.set_ylabel(y_name,fontsize=16)
ax.yaxis.tick_right()
ax.xaxis.tick_top()
ax.yaxis.set_label_position("right")
ax.xaxis.set_label_position("top")
ax.set_title('Vowels',fontsize=18)
#ax.grid()
#plt.savefig('my_vowel_plot.png')
plt.show()


In [None]:
# tease apart outliers: 'ŭ' tokens with F2 above 1300 and F1 above 400
data.loc[(data.F2 > 1300) & (data.F1 > 400) & (data.segment == 'ŭ')]

In [None]:
# plot duration distributions

In [None]:
# Mean and standard deviation of duration (in ms) for low vowels in
# polysyllabic words of consultant AOK, per structure / position / stress.
# Both statistics are computed from the same subset; previously the standard
# deviation was taken over all consultants while the mean was AOK-only.
aok_low_polysyllabic = mrg_df_filled.loc[
        (mrg_df_filled['syllable count'] == 'polysyllabic') & \
        (mrg_df_filled.vowel == 'low') & \
#         (mrg_df_filled.position != 'final') & \
        (mrg_df_filled.consultant == 'AOK')]
durations_by_context = aok_low_polysyllabic\
    .groupby(['syllable structure', 'position', 'stress'])['duration']

a_data = pd.DataFrame(durations_by_context.mean().round(5) * 1000)
a_data['std'] = durations_by_context.std().round(5) * 1000

In [None]:
# display the duration summary table
a_data

In [None]:
# values of the first row of plot_data
# NOTE(review): plot_data is only assigned in later cells of the visible
# notebook, so this cell needs one of them to have been run first.
plot_data.iloc[0].values

In [None]:
# non-final length distributions

plot_data = mrg_df_filled.loc[
    (mrg_df_filled['syllable count'] == 'polysyllabic') & \
    (mrg_df_filled.vowel != 'mid') & \
    (mrg_df_filled['pre-schwa'] == 'no') & \
    (mrg_df_filled.position != 'final')
]

# Zero-duration placeholder observation for unstressed medial CVC --
# presumably so that this combination shows up in the grid even without data.
# It is built by column name and appended with concat: assigning a positional
# list to .loc[len(plot_data)] depends on the exact column order and
# overwrites a real row whenever that label already exists in the index.
placeholder = dict.fromkeys(plot_data.columns, 0)
placeholder.update({
    'filename': '',
    'word': '',
    'segment': 'ă',
    'syllable structure': 'CVC',
    'syllable count': 'polysyllabic',
    'position': 'medial',
    'stress': 'unstressed',
    'vowel': 'low',
    'pre-schwa': 'no',
    'consultant': 'AOK',
})
plot_data = pd.concat([plot_data, pd.DataFrame([placeholder])], ignore_index=True)

g = sns.FacetGrid(plot_data,
                  row='position', col='syllable structure', margin_titles=True, height=3, aspect=1,)
g.map(sns.barplot, 'stress', "duration", palette="Set1")
g.add_legend()
plt.show()

In [None]:
# Duration of low vowels in word-final syllables of polysyllabic words,
# one panel per syllable structure (CVVCC excluded), bars by stress.

is_polysyllabic = mrg_df_filled['syllable count'] == 'polysyllabic'
is_low = mrg_df_filled.vowel == 'low'
is_not_cvvcc = mrg_df_filled['syllable structure'] != 'CVVCC'
is_final = mrg_df_filled.position == 'final'

plot_data = mrg_df_filled.loc[is_polysyllabic & is_low & is_not_cvvcc & is_final]

g = sns.FacetGrid(
    plot_data,
    row='vowel',
    col='syllable structure',
    margin_titles=True,
    height=3,
    aspect=1,
)
g.map(sns.barplot, 'stress', "duration", palette='pastel')
g.add_legend()
plt.show()

In [None]:
# mean duration of the currently plotted subset per structure / stress / vowel class
plot_data.groupby(['syllable structure', 'stress', 'vowel'])['duration'].mean()
# plot_data.loc[plot_data['syllable structure'] == 'CVVC'].sort_values('duration', ascending=False)

In [None]:
# monosyllables by vowel quality

plot_data = mrg_df_filled.loc[
    (mrg_df_filled['syllable count'] == 'monosyllable') & \
    (mrg_df_filled['syllable structure'] != 'CVVCC') & \
    (mrg_df_filled['syllable structure'] != 'CVCC')
]

# Zero-duration placeholder observation for a low vowel in a CV syllable --
# presumably so that this combination shows up in the grid even without data.
# The values are assigned by column name: the earlier positional list no
# longer matched the column order built in this notebook, which put 'ă' into
# 'syllable structure' and shifted every following value.  The syllable count
# uses the 'monosyllable' label that the data itself carries.
placeholder = dict.fromkeys(plot_data.columns, 0)
placeholder.update({
    'filename': '',
    'word': '',
    'segment': 'ă',
    'syllable structure': 'CV',
    'syllable count': 'monosyllable',
    'position': 'medial',
    'stress': 'unstressed',
    'vowel': 'low',
    'pre-schwa': 'no',
    'consultant': 'AOK',
})
plot_data = pd.concat([plot_data, pd.DataFrame([placeholder])], ignore_index=True)

g = sns.FacetGrid(plot_data,
                  col='syllable structure', margin_titles=True, height=3, aspect=1,)
g.map(sns.barplot, 'vowel', "duration", palette='pastel6')
g.add_legend()
plt.show()

In [None]:
# pre-schwa unstressed vs stressed

plot_data = mrg_df_filled.loc[
    (mrg_df_filled['syllable count'] == 'polysyllabic') & \
    (mrg_df_filled['syllable structure'] != 'CVVCC') & \
    (mrg_df_filled['syllable structure'] != 'CVCC')
]

# Zero-duration placeholder observation for unstressed pre-schwa CVVC --
# presumably so that this combination shows up in the grid even without data.
# It is built by column name and appended with concat: assigning a positional
# list to .loc[len(plot_data)] depends on the exact column order and
# overwrites a real row whenever that label already exists in the index.
placeholder = dict.fromkeys(plot_data.columns, 0)
placeholder.update({
    'filename': '',
    'word': '',
    'segment': 'ă',
    'syllable structure': 'CVVC',
    'syllable count': 'polysyllabic',
    'position': 'medial',
    'stress': 'unstressed',
    'vowel': 'low',
    'pre-schwa': 'yes',
    'consultant': 'AOK',
})
plot_data = pd.concat([plot_data, pd.DataFrame([placeholder])], ignore_index=True)

g = sns.FacetGrid(plot_data,
                  col='syllable structure', row='stress', margin_titles=True, height=3, aspect=1,)
g.map(sns.barplot, 'pre-schwa', "duration", palette='flare')
g.add_legend()
plt.show()

In [None]:
# Short syllables (CV, CVC): monosyllables compared with stressed syllables
# of polysyllabic words, one row of panels per vowel class.

is_short_shape = mrg_df_filled['syllable structure'].isin(['CVC', 'CV'])
is_stressed_polysyllabic = (
    (mrg_df_filled['syllable count'] == 'polysyllabic')
    & (mrg_df_filled['stress'] == 'stressed')
)
is_monosyllable = mrg_df_filled['syllable count'] == 'monosyllable'

plot_data = mrg_df_filled.loc[
    is_short_shape & (is_stressed_polysyllabic | is_monosyllable)
]

g = sns.FacetGrid(
    plot_data,
    col='syllable structure',
    row='vowel',
    margin_titles=True,
    height=3,
    aspect=1,
)
g.map(sns.barplot, 'syllable count', "duration", palette='flare')
g.add_legend()
plt.show()

In [None]:
# Duration bars for all tokens of one word, split by syllable structure
# (rows) and stress (columns).

word = 'dʹa'
print('\n', word)

is_target_word = mrg_df_filled['word'] == word
plot_data = mrg_df_filled.loc[is_target_word]

g = sns.FacetGrid(
    plot_data,
    row='syllable structure',
    col='stress',
    margin_titles=True,
    height=3,
    aspect=1,
)
g.map(sns.barplot, 'pre-schwa', "duration", palette='flare')
g.add_legend()
plt.show()

In [None]:
# draw a duration table for every vowel in the word

def get_word_duration_table(word, round_factor=5, columns_to_add=None):
    """
    Print a duration table for every vowel of `word`, as text and as LaTeX.

    Reads the module-level frame `mrg_df_filled`.

    word: value of the 'word' column to select
    round_factor: number of decimals kept before the values are multiplied
        by 1000 for the 'mean, ms' and 'std, ms' columns
    columns_to_add: grouping columns shown next to the statistics; defaults to
        ['position', 'stress'].  Must be a subset of the grouping columns
        ('syllable structure', 'position', 'stress', 'vowel', 'pre-schwa').

    Returns None; the table is only printed.
    """
    # None instead of a list literal: a mutable default is shared between calls.
    if columns_to_add is None:
        columns_to_add = ['position', 'stress']

#  & (~mrg_df_filled.filename.str.contains('kăm')
    grpb_object = mrg_df_filled.loc[(mrg_df_filled.word == word)].groupby([
        'word', 'segment',
        'syllable structure',
        'position', 'stress',
        'vowel', 'pre-schwa',
    ])['duration']
    mean, std, count = grpb_object.mean(), grpb_object.std(), grpb_object.count()
    table = mean.rename('mean, ms').to_frame()\
        .join(std.rename('std, ms'))\
        .round(round_factor) * 1000
    # The count is joined after the scaling so that it is not multiplied by 1000.
    table = table.join(count.rename('count'))
    shown_columns = ['word', 'segment', 'mean, ms', 'std, ms', 'count'] + list(columns_to_add)
    table = table.reset_index()[shown_columns]
    print(table, end='\n\n')
    print(table.to_latex(index=False))

get_word_duration_table('kemta')