In [1]:
import os
import numpy as np
import audiolabel
import pandas as pd
import parselmouth as pm
import re

Observation: in words with nasal vowels followed by oral consonants, such as V1 in /tako/+N => [tãko], sometimes we find an increasing nasal airflow with decreasing oral airflow, and sometimes too what appears to be a nasal stop. 

Goal: substatiate this observation with some measurements from the audio and airflow data.

Method: determine oral and nasal airflow during V productions, at -10%, 10%, 40%, 60%, 90%, and 110% of the segment as determined by the acoustics.

Dataframe should have the following columns:
- speaker
- file
- word
- repetition
- vowel
- t1
- t2
- oaf10pre
- oaf10
- oaf40
- oaf60
- oaf90
- oaf110
- naf10pre
- naf10
- naf40
- naf60
- naf90
- af110

Since we're (at this point) only interested in what's going on with vowels, we'll need to define a list of symbols representing vowels. This should be updated any time a new vowel appears!

In [2]:
vowels = ['U', 'ɨ','E','O','i','e','o','I','#']

Point the script to the folder where all the data is, and then use it to get a list of all of the TextGrids. We're assuming that there is a audio file for every textgrid, but not necessarily the other way around.

In [3]:
datadir = os.path.abspath('./maidata/')

In [4]:
tgs = [f for f in os.listdir(datadir) if f[-9:]=='.TextGrid']
tgs

['mai_nas_24may2017_emr_t_t+b+.TextGrid',
 'mai_nas_24may2017_emr_hUt+.TextGrid',
 'mai_nas_24may2017_emr_hEhOyi_2.TextGrid']

Now, loop through all the files! See inline comments.

In [5]:
# set up an empty dataframe to append new data to
alldf = pd.DataFrame(columns=['speaker','file','word','vowel','t1','t2','af1_10pre', 'af1_10', 'af1_40', 'af1_60',
       'af1_90', 'af1_110', 'af2_10pre', 'af2_10', 'af2_40', 'af2_60',
       'af2_90', 'af2_110'])

for t in tgs:
    
    # To handle naming weirdnesses, we're using regular expressions
    # mai_nas_DATE_(SPKR)_(WORD)(_ITER).TextGrid 
    p = re.compile('(mai_nas_[0-9]+[a-z]+[0-9]+_)([a-z]{3})_(\D+)((_\d)*\.)TextGrid')
    m = p.match(t)
    prefix = m.group(1)   # we'll need this later to reconstruct the wavfile name
    speaker = m.group(2)
    word = m.group(3)
    suffix = m.group(4)   # we'll need this too for wavfile name
    
    # in the special case where the word had a '#' in it that was messed up by Praat, fix it
    p = re.compile('_')
    word = p.sub('#', word)
    
    wavfile = ''.join([prefix,speaker+'_'+word,suffix,'wav'])  # figure out the filename for the audio file
    print(wavfile) # Print out the name of the audio file and textgrid for user to see where in loop process is
    print(t)

    # read in the textgrid, and save the first tier as phdf
    tg = audiolabel.LabelManager(from_file=os.path.join(datadir,t), from_type='praat')
    [phdf, wddf, trash] = tg.as_df()

    voweldf = phdf[phdf.text.isin(vowels)] # gets only vowels in phdf
    voweldf = voweldf.rename(columns={'text':'vowel'})

    # Define timepoints
    voweldf['t10pre'] = voweldf.apply(lambda x: x.t1-x.duration*.1, axis=1)
    voweldf['t10'] = voweldf.apply(lambda x: x.t1+x.duration*.1, axis=1)
    voweldf['t40'] = voweldf.apply(lambda x: x.t1+x.duration*.4, axis=1)
    voweldf['t60'] = voweldf.apply(lambda x: x.t1+x.duration*.6, axis=1)
    voweldf['t90'] = voweldf.apply(lambda x: x.t1+x.duration*.9, axis=1)
    voweldf['t110'] = voweldf.apply(lambda x: x.t1+x.duration*1.1, axis=1)

    # read in the wav file
    wav = pm.Sound(os.path.join(datadir,wavfile))
    [au, af1, af2] = wav.extract_all_channels()

    # measure values of af1 and af2
    voweldf['af1_10pre'] = voweldf.t10pre.apply(lambda x: af1.get_value(x))
    voweldf['af1_10'] = voweldf.t10.apply(lambda x: af1.get_value(x))
    voweldf['af1_40'] = voweldf.t40.apply(lambda x: af1.get_value(x))
    voweldf['af1_60'] = voweldf.t60.apply(lambda x: af1.get_value(x))
    voweldf['af1_90'] = voweldf.t90.apply(lambda x: af1.get_value(x))
    voweldf['af1_110'] = voweldf.t110.apply(lambda x: af1.get_value(x))

    voweldf['af2_10pre'] = voweldf.t10pre.apply(lambda x: af2.get_value(x))
    voweldf['af2_10'] = voweldf.t10.apply(lambda x: af2.get_value(x))
    voweldf['af2_40'] = voweldf.t40.apply(lambda x: af2.get_value(x))
    voweldf['af2_60'] = voweldf.t60.apply(lambda x: af2.get_value(x))
    voweldf['af2_90'] = voweldf.t90.apply(lambda x: af2.get_value(x))
    voweldf['af2_110'] = voweldf.t110.apply(lambda x: af2.get_value(x))

    # clean up!
    voweldf = voweldf.drop(columns=['duration','center','t10pre','t10','t40','t60','t90','t110'])
    voweldf['speaker'] = speaker
    voweldf['word'] = word
    voweldf['file'] = wavfile

    alldf = alldf.append(voweldf) # add the new data to the full df
    
alldf.to_csv('./maivowels.csv') # write the full df to a csv file

mai_nas_24may2017_emr_t#t+b+.wav
mai_nas_24may2017_emr_t_t+b+.TextGrid
mai_nas_24may2017_emr_hUt+.wav
mai_nas_24may2017_emr_hUt+.TextGrid
mai_nas_24may2017_emr_hEhOyi_2.wav
mai_nas_24may2017_emr_hEhOyi_2.TextGrid


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  voweldf['t10pre'] = voweldf.apply(lambda x: x.t1-x.duration*.1, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  voweldf['t10'] = voweldf.apply(lambda x: x.t1+x.duration*.1, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  voweldf['t40'] = voweldf.apply(lambda x: x.t1+x.duration*.4, axi

In [6]:
alldf

Unnamed: 0,speaker,file,word,vowel,t1,t2,af1_10pre,af1_10,af1_40,af1_60,af1_90,af1_110,af2_10pre,af2_10,af2_40,af2_60,af2_90,af2_110
2,emr,mai_nas_24may2017_emr_t#t+b+.wav,t#t+b+,#,0.803628,0.969288,0.000516,0.307184,0.16184,0.035314,0.046702,0.024785,0.054104,0.032882,0.098237,0.315134,0.093031,0.044652
4,emr,mai_nas_24may2017_emr_t#t+b+.wav,t#t+b+,ɨ,1.015651,1.12963,-0.031548,0.583145,0.177928,0.18727,0.105745,0.01767,0.034973,0.025752,0.02945,0.026847,0.018546,0.022657
6,emr,mai_nas_24may2017_emr_t#t+b+.wav,t#t+b+,ɨ,1.178057,1.308713,0.084248,0.269508,0.088904,0.154114,0.249138,0.284997,0.031135,0.03586,0.025892,0.014835,0.301773,0.488687
9,emr,mai_nas_24may2017_emr_t#t+b+.wav,t#t+b+,#,1.774577,1.929465,-0.012871,0.191436,0.118638,0.1033,0.023584,0.014603,0.043483,0.031047,0.0558,0.158818,0.222048,0.04955
11,emr,mai_nas_24may2017_emr_t#t+b+.wav,t#t+b+,ɨ,2.00394,2.126879,-0.02824,0.303384,0.097137,0.128387,0.11466,0.058071,0.037336,0.032642,0.03207,0.027324,0.013847,0.032085
13,emr,mai_nas_24may2017_emr_t#t+b+.wav,t#t+b+,ɨ,2.183639,2.251863,0.12776,0.115885,0.081926,0.065677,0.065247,0.053297,0.036674,0.041414,0.040819,0.04236,0.035717,0.035362
16,emr,mai_nas_24may2017_emr_t#t+b+.wav,t#t+b+,#,2.654057,2.815506,-0.025014,0.412147,0.336302,0.020628,0.014228,0.002099,0.038337,0.031639,0.031204,0.258694,0.286894,0.190318
18,emr,mai_nas_24may2017_emr_t#t+b+.wav,t#t+b+,ɨ,2.869724,2.967169,-0.002668,0.501796,0.145444,0.157544,0.163854,0.034043,0.10206,0.031357,0.03059,0.021274,0.009041,0.020619
20,emr,mai_nas_24may2017_emr_t#t+b+.wav,t#t+b+,ɨ,3.023717,3.104734,0.147623,0.18689,0.075641,0.056598,0.02219,-0.008017,0.03237,0.037464,0.037531,0.025961,0.005934,-0.005587
2,emr,mai_nas_24may2017_emr_hUt+.wav,hUt+,U,0.513778,0.690532,0.792231,0.571958,0.317748,0.320393,0.374175,0.018692,0.56937,0.439702,0.259571,0.266123,0.362978,0.07704
