In [1]:
import parselmouth as ps
from parselmouth.praat import call as pcall  
import audiolabel
import pandas as pd
import os

## Data setup
This notebook assumes that the data files are located in a subfolder called 'data', which is further divided into one folder per speaker ID. Let's store these locations in variables so that we can access them quickly and change them easily if needed.

In [2]:
# os.path.abspath turns the relative folder name into its full ("absolute") path
datadir = os.path.abspath('./data')
# os.listdir returns the names of everything inside that folder (files and subfolders alike)
os.listdir(path=datadir)

['.DS_Store',
 '21_6_25_mestres_modal.TextGrid',
 '21_6_25_mestres_modal.wav',
 'S01',
 '21_06_25_TikTok_14.TextGrid',
 '21_06_25_TikTok_14.wav']

So long as the only folders in the 'data' subfolder are subfolders for separate speakers, the code below will get a list of the speakers.

In [3]:
# Keep only the entries of the data folder that are themselves folders; each one is treated as a speaker ID
speakers = [
    entry
    for entry in os.listdir(datadir)
    if os.path.isdir(os.path.join(datadir, entry))
]
speakers

['S01']

For any given speaker, we can access their audio and annotation files like so:

In [4]:
# Folder holding the files for speaker 'S01'
spkdir = os.path.join(datadir, 'S01')
# Every file in that folder whose name ends in '.wav'
wavs = [name for name in os.listdir(spkdir) if name.endswith('.wav')]
wavs

['21_07_20_espana_302.wav',
 '21_07_20_zona_301.wav',
 '21_07_20_receta_303.wav',
 '21_07_20_paso_309.wav']

In [5]:
# Every file in the speaker folder whose name ends in '.TextGrid'
tgs = [name for name in os.listdir(spkdir) if name.endswith('.TextGrid')]
tgs

['21_07_20_receta_303.TextGrid',
 '21_07_20_zona_301.TextGrid',
 '21_07_20_paso_309.TextGrid',
 '21_07_20_espana_302.TextGrid']

## Iterating

We'll create an empty dataframe with columns for each variable we want to save. Then, for each annotation, we want to do the following:
1. create an associated dataframe
1. add relevant information to the dataframe
1. read in the associated audio file
1. create a harmonicity object
1. calculate the HNR at the times listed in the dataframe and save that information to the dataframe
1. append this dataframe to the full dataframe


### Single loop test

It's sometimes good to see what a single loop will look like. This can help us decide whether we need to write any functions to make our code tidier, or if we'd prefer to have one big for loop.

In [6]:
# Pick a single speaker to walk through by hand
spk = speakers[0]
spkdir = os.path.join(datadir, spk)

# ...and a single one of that speaker's TextGrids
tgs = [name for name in os.listdir(spkdir) if name.endswith('.TextGrid')]
tgfile = tgs[0]

# Drop the '.TextGrid' extension to get the filename "prefix". The prefix lets us find the
# associated audio file, and it also carries other information we might find relevant.
prefix = tgfile[:-len('.TextGrid')]
prefix

'21_07_20_receta_303'

1. create an associated dataframe

As before, we'll do this using the `as_df()` method from audiolabel. An important note is that our new textgrids have the following tiers:
- vowel: containing either vowel quality (phonemic?) or 'SP' to indicate silence after the IP
- stress: containing intervals labeled with 'n' to indicate primary stress
- word: containing intervals labeled with the lexical items
- topic: containing intervals spanning the phrase of interest, labeled with their general topics
- whisper: containing intervals labeled with non-modal voicing

In [7]:
# Full path of the TextGrid chosen above, read in as a Praat-format annotation file
tgpath = os.path.join(datadir, spk, tgfile)
tg = audiolabel.LabelManager(from_file=tgpath, from_type='praat')
tg

In [8]:
# Base dataframe: one row per interval on the 'vowel' tier, with the generic 'text' column
# renamed to what it really holds, 'VowelQ'
df = tg.tier('vowel').as_df().rename(columns={'text': 'VowelQ'})

# Keep only labeled intervals whose label does not start with 'SP' (the silence marker on this tier)
df = df[lambda d: (d['VowelQ'] != '') & (d['VowelQ'].str[:2] != 'SP')]
df

Unnamed: 0,t1,t2,VowelQ,duration,center
1,8.203826,8.297169,ia,0.093343,8.250498
3,8.796583,8.878143,i,0.08156,8.837363
5,8.991487,9.052258,e,0.060771,9.021872
7,9.157806,9.206583,e,0.048776,9.182195
9,9.315516,9.43881,1a,0.123294,9.377163


2. add relevant information to the dataframe

The first set of information will come from the filename and file structure

In [9]:
# Information that comes from the file structure and the filename rather than the annotations.
# 'year' is whatever precedes the first underscore in the prefix (two digits in these filenames).
# If years go back before 2000, we'll want to revisit this to add 19- or 20- appropriately.
df = df.assign(
    speaker=spk,
    year=prefix.partition('_')[0],
    recording=prefix,
)
df

Unnamed: 0,t1,t2,VowelQ,duration,center,speaker,year,recording
1,8.203826,8.297169,ia,0.093343,8.250498,S01,21,21_07_20_receta_303
3,8.796583,8.878143,i,0.08156,8.837363,S01,21,21_07_20_receta_303
5,8.991487,9.052258,e,0.060771,9.021872,S01,21,21_07_20_receta_303
7,9.157806,9.206583,e,0.048776,9.182195,S01,21,21_07_20_receta_303
9,9.315516,9.43881,1a,0.123294,9.377163,S01,21,21_07_20_receta_303


The next will come from the annotation TextGrid

In [10]:
# add metadata
# Each new column holds the label found on another tier at the vowel's center time.
column_tiers = [('VoiceQ', 'whisper'), ('stress', 'stress'), ('word', 'word'), ('topic', 'topic')]
for column, tier in column_tiers:
    # tier=tier pins the current tier name inside the lambda
    df[column] = df.center.apply(lambda t, tier=tier: getattr(tg.labels_at(t), tier).text)
df

Unnamed: 0,t1,t2,VowelQ,duration,center,speaker,year,recording,VoiceQ,stress,word,topic
1,8.203826,8.297169,ia,0.093343,8.250498,S01,21,21_07_20_receta_303,,,farmacias,over-the-counter antibody tests
3,8.796583,8.878143,i,0.08156,8.837363,S01,21,21_07_20_receta_303,,,sin,over-the-counter antibody tests
5,8.991487,9.052258,e,0.060771,9.021872,S01,21,21_07_20_receta_303,,,receta,over-the-counter antibody tests
7,9.157806,9.206583,e,0.048776,9.182195,S01,21,21_07_20_receta_303,,n,receta,over-the-counter antibody tests
9,9.315516,9.43881,1a,0.123294,9.377163,S01,21,21_07_20_receta_303,whisper,,receta,over-the-counter antibody tests


In [11]:
# add information about relationship between vowels and end of IP
# End of IP is defined as the left boundary of the first 'SP' interval on the vowel tier.
# Match on the first two characters -- the same test used when the 'SP' rows were filtered out
# of df -- so that a label that merely starts with 'SP' is found here too.
pauses = [i for i in tg.tier('vowel') if i.text[0:2]=='SP']
endofIP = pauses[0].t1
df['timetoIPend'] = endofIP - df.t1 # time from each vowel's onset to the end of the IP
df['VowelN'] = range(len(df), 0, -1) # counts down, so the last vowel before the pause is 1
df

Unnamed: 0,t1,t2,VowelQ,duration,center,speaker,year,recording,VoiceQ,stress,word,topic,timetoIPend,VowelN
1,8.203826,8.297169,ia,0.093343,8.250498,S01,21,21_07_20_receta_303,,,farmacias,over-the-counter antibody tests,1.234984,5
3,8.796583,8.878143,i,0.08156,8.837363,S01,21,21_07_20_receta_303,,,sin,over-the-counter antibody tests,0.642227,4
5,8.991487,9.052258,e,0.060771,9.021872,S01,21,21_07_20_receta_303,,,receta,over-the-counter antibody tests,0.447323,3
7,9.157806,9.206583,e,0.048776,9.182195,S01,21,21_07_20_receta_303,,n,receta,over-the-counter antibody tests,0.281003,2
9,9.315516,9.43881,1a,0.123294,9.377163,S01,21,21_07_20_receta_303,whisper,,receta,over-the-counter antibody tests,0.123294,1


3. read in the associated audio file
1. create a harmonicity object
1. calculate the HNR at the times listed in the dataframe and save that information to the dataframe

In [13]:
# Load the recording that shares this TextGrid's prefix, then derive the harmonicity (HNR)
# and pitch objects from it
wavpath = os.path.join(datadir, spk, prefix + '.wav')
sound = ps.Sound(wavpath)
harm = sound.to_harmonicity_cc()
pitch = sound.to_pitch_cc()

# "_mid" columns are sampled at each vowel's center time;
# "_mean" columns are Praat's 'Get mean...' over the vowel's [t1, t2] span
df['hnr_mid'] = df['center'].map(lambda t: harm.get_value(t))
df['hnr_mean'] = df.apply(lambda row: pcall(harm, 'Get mean...', row.t1, row.t2), axis=1)
df['f0_mid'] = df['center'].map(lambda t: pitch.get_value_at_time(t))
df['f0_mean'] = df.apply(lambda row: pcall(pitch, 'Get mean...', row.t1, row.t2, 'Hertz'), axis=1)
df

Unnamed: 0,t1,t2,VowelQ,duration,center,speaker,year,recording,VoiceQ,stress,word,topic,timetoIPend,VowelN,hnr_mid,hnr_mean,f0_mid,f0_mean
1,8.203826,8.297169,ia,0.093343,8.250498,S01,21,21_07_20_receta_303,,,farmacias,over-the-counter antibody tests,1.234984,5,7.944193,9.312631,184.098287,182.484058
3,8.796583,8.878143,i,0.08156,8.837363,S01,21,21_07_20_receta_303,,,sin,over-the-counter antibody tests,0.642227,4,22.262807,17.735294,234.218248,230.420115
5,8.991487,9.052258,e,0.060771,9.021872,S01,21,21_07_20_receta_303,,,receta,over-the-counter antibody tests,0.447323,3,7.62164,8.027255,141.97594,142.527101
7,9.157806,9.206583,e,0.048776,9.182195,S01,21,21_07_20_receta_303,,n,receta,over-the-counter antibody tests,0.281003,2,7.459927,5.237403,86.167782,99.193223
9,9.315516,9.43881,1a,0.123294,9.377163,S01,21,21_07_20_receta_303,whisper,,receta,over-the-counter antibody tests,0.123294,1,-200.0,6.401208,,513.964529


### Creating loops

The code for the single iteration was not so bad, so let's try keeping it as is and creating a for loop around it! But first... create an empty dataframe with the columns we know that we will wind up with. This is a good opportunity to move the columns around if desired. They can be moved later too, but this is a convenient time.

In [14]:
# Empty table whose columns are listed in the order we want them to appear
fulldf = pd.DataFrame(
    columns=(
        ['speaker', 'recording', 'year', 'topic']         # where the vowel comes from
        + ['VowelN', 't1', 't2', 'timetoIPend']           # position and timing
        + ['VowelQ', 'word', 'VoiceQ', 'stress']          # annotation labels
        + ['hnr_mid', 'hnr_mean', 'f0_mid', 'f0_mean']    # acoustic measures
    )
)
fulldf

Unnamed: 0,speaker,recording,year,topic,VowelN,t1,t2,timetoIPend,VowelQ,word,VoiceQ,stress,hnr_mid,hnr_mean,f0_mid,f0_mean


For the loop itself, we'll need to iterate through all speakers (for now, there's only one, but there may be more in the future, so it's good to get a sense of how that will work), and then through all of their textgrids. For each textgrid, we'll do everything we did above, but all together inside the loop.

Then, before moving on to the next iteration, we'll add that recording's dataframe to the full dataset, which starts out empty and grows with each recording.

There are also a couple of print commands to help us keep track of where we are in the loops, and so you can more easily see when the code has finished executing.

In [17]:
# Column order for the combined table. Any other columns the per-recording dataframes carry
# (e.g. 'duration' and 'center' from audiolabel) are kept and placed after these.
columns = ['speaker', 'recording', 'year', 'topic', 'VowelN', 't1', 't2', 'timetoIPend',
           'VowelQ', 'word', 'VoiceQ', 'stress',
           'hnr_mid', 'hnr_mean', 'f0_mid', 'f0_mean']

# One dataframe per recording is collected here and combined once after the loops.
# (DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop re-copies
# all earlier rows on every iteration.)
frames = []

for spk in speakers: # iterate through speakers

    spkdir = os.path.join(datadir, spk)
    tgs = [f for f in os.listdir(spkdir) if f.endswith('.TextGrid')]

    for tgfile in tgs: # iterate through textgrids

        prefix = tgfile[:-len('.TextGrid')] # filename without the extension; shared with the .wav
        print(prefix) # To help us keep track of what iteration we're in

        # create the base dataframe: labeled vowel intervals only, without the 'SP' marker
        tg = audiolabel.LabelManager(from_file=os.path.join(spkdir, tgfile), from_type='praat')
        df = tg.tier('vowel').as_df()
        df = df.rename(columns={'text':'VowelQ'})
        df = df[(df.VowelQ!='') & (df.VowelQ.str[0:2]!='SP')] # have to add check for SP now

        # get some information from the file structure and filenames
        df['speaker'] = spk
        df['year'] = prefix.split('_')[0]
        df['recording'] = prefix

        # add metadata: the label on each of the other tiers at the vowel's center time
        df['VoiceQ'] = df.center.apply(lambda t: tg.labels_at(t).whisper.text)
        df['stress'] = df.center.apply(lambda t: tg.labels_at(t).stress.text)
        df['word'] = df.center.apply(lambda t: tg.labels_at(t).word.text)
        df['topic'] = df.center.apply(lambda t: tg.labels_at(t).topic.text)

        # add information about relationship between vowels and end of IP
        # (end of IP = left boundary of the first 'SP' interval on the vowel tier)
        pauses = [i for i in tg.tier('vowel') if i.text[0:2]=='SP']
        if not pauses:
            # name the offending file instead of failing with a bare IndexError
            raise ValueError("no 'SP' interval on the vowel tier of {}".format(
                os.path.join(spkdir, tgfile)))
        endofIP = pauses[0].t1
        df['timetoIPend'] = endofIP - df.t1
        df['VowelN'] = range(len(df), 0, -1) # counts down, so the last vowel before the pause is 1

        # read in audio files and create HNR object
        # The audio lives in the *current* speaker's folder, so build the path from spkdir
        # rather than a fixed speaker ID.
        au = os.path.join(spkdir, prefix+'.wav')
        sound = ps.Sound(au)
        harm = sound.to_harmonicity_cc()
        pitch = sound.to_pitch(pitch_ceiling=300) # I added a pitch ceiling here because we were getting some outrageous values

        # calculate HNR values and add to df
        df['hnr_mid']=df.center.apply(lambda t: harm.get_value(t))
        df['hnr_mean']=df.apply(lambda v: pcall(harm, 'Get mean...', v.t1, v.t2), axis=1)
        df['f0_mid']=df.center.apply(lambda t: pitch.get_value_at_time(t))
        df['f0_mean']=df.apply(lambda v: pcall(pitch, 'Get mean...', v.t1, v.t2, 'Hertz'), axis=1)

        frames.append(df.reset_index(drop=True)) # index restarts at 0 for every recording

if frames:
    fulldf = pd.concat(frames)
    # listed columns first, in the order given above; everything else afterwards
    extras = [c for c in fulldf.columns if c not in columns]
    fulldf = fulldf[columns + extras]
else:
    # no TextGrids found at all: still leave an (empty) table with the expected columns
    fulldf = pd.DataFrame(columns=columns)

print('done')

21_07_20_receta_303
21_07_20_zona_301
21_07_20_paso_309
21_07_20_espana_302
done


Finally, let's save this!

In [18]:
# Write the combined table (including its index column) to a CSV next to this notebook
fulldf.to_csv(path_or_buf='./fulldata.csv')