# Moth Transcripts to Gentle

The Huth Moth transcripts are provided within Praat. There are two issues with this format:
1. There is no joint transcript including punctuation (allowing us to present the next-word prediction framework)
2. Our pipeline uses Gentle as its starting point to process files

We load the Praat files and align it with a transcript generated through ChatGPT (adjusting mismatched words).

In [1]:
%load_ext autoreload
import os, sys, glob
import json
import re
import numpy as np
import pandas as pd
from pathlib import Path
from praatio import textgrid as tgio
import json
import shutil

sys.path.append('../utils/')

from text_utils import strip_punctuation
# from text_utils import get_pos_tags, get_lemma

In [2]:
def load_clean_textgrid(praat_fn):
    '''
    Load a praat textgrid file using PraatIO
    '''
    
    # things to remove from the textgrid (indicates laughing, chewing, pauses etc)
    REMOVE_CHARACTERS = ['sp', 'br', 'lg', 'cg', 'ls', 'ns', 'sl', 'ig',
                         '{sp}', '{br}', '{lg}', '{cg}', '{ls}', '{ns}', '{sl}', '{ig}', 'pause']
    
    # open the textgrid
    tg = tgio.openTextgrid(praat_fn, includeEmptyIntervals=False, reportingMode="warning") 
    
    # remove entries of unwanted characters
    for tier_name in tg.tierNames:
        # get the current tier
        tier = tg.getTier(tier_name)
        
        for x in tier.entries:
            if x[-1].lower() in REMOVE_CHARACTERS:
                tier.deleteEntry(x)

#         for char in REMOVE_CHARACTERS:
#             upper_set = set(tier.find(char.upper()))
#             lower_set = set(tier.find(char.lower()))
#             remove_idxs = sorted(upper_set.union(lower_set))

#             # go through each index and remove
#             for idx in remove_idxs:
#                 try:
#                     tier.deleteEntry(tier.entries[idx])
#                 except:
#                     print (idx)
    
#     # go through each entry at the word tier, remove the items
#     words = [x for x in tg.getTier('word').entries if x[-1].lower() not in REMOVE_CHARACTERS]
#     phones = [x for x in tg.getTier('phone').entries if x[-1].lower() not in REMOVE_CHARACTERS]
#     words = tg.getTier('word').entries
#     phones = tg.getTier('phone').entries
    return tg

def load_transcription(transcript_fn):
    
    with open(transcript_fn, 'r') as f: #open the file
        contents = f.readlines() #put the lines to a variable (list).
        
    # get the transcription stripped of punctuation
    words_transcribed = strip_punctuation(contents).split()
    
    return contents, words_transcribed

def textgrid_to_gentle(praat_fn, transcript_fn):
    '''
    Transform Moth dataset textgrid files into gentle format
    '''
    
    textgrid = load_clean_textgrid(praat_fn)
    tg_words = textgrid.getTier('word')
    
    contents, words_transcribed = load_transcription(transcript_fn)
    
    assert (len(tg_words) == len(words_transcribed))
    
    # create the dictionary to store things in
    # put the transcript in the raw form
    align = {}
    align['transcript'] = contents[0]
    align['words'] = []
    
    # Taken from Kaldi metasentence tokenizer
    # splits the transcript based on any punctuation besides for apostrophes and hyphens
    regex_split_pattern = r'(\w|\.\w|\:\w|\’\w|\'\w|\-\w)+'
    
    iterator = list(re.finditer(regex_split_pattern, ''.join(contents), re.UNICODE))
    n_items = len(list(iterator))
    
    # make sure the iterator matches the length
    assert (n_items == len(tg_words) == len(words_transcribed))
    
    # if all matches we're good to go
    for word_info, m in zip(tg_words, iterator):
        # span of the word in characters relative to the overall string
        start_offset, end_offset = m.span()
        word = m.group()
        
        # crop textgrid to the word
        cropped_grid = textgrid.crop(cropStart=word_info[0], cropEnd=word_info[1], mode='truncated', rebaseToZero=False)
        tg_phones = cropped_grid.getTier('phone').entries
        word_phones = []

        for phone_info in tg_phones:
            phone = re.sub(r'\d+', '', phone_info[-1])
            duration = phone_info[1] - phone_info[0]
            word_phones.append({'duration': duration, 'phone': phone})

        word_align = {
            'alignedWord': word.lower(),
            "case": "success",
            'word': word,
            'start': word_info[0],
            'end': word_info[1],
            'phones': word_phones,
            "startOffset": start_offset,
            "endOffset": end_offset,
        }
        
        align['words'].append(word_align)
        
    return align

## Set paths 

These are paths to the main directory and the stimulus directory

CHANGE THE PATH BELOW TO MATCH YOUR DIRECTORY --> FinnLabTasks/transcript_alignment/

In [3]:
base_dir = '/dartfs/rc/lab/F/FinnLab/tommy/isc_asynchrony_behavior/'
datasets_dir = '/dartfs/rc/lab/F/FinnLab/datasets/'
# stim_dir = os.path.join(datasets_dir, 'IBC/stimuli/lepetitprince/')

# for prepping for onlin eexpt
stim_dir = os.path.join(base_dir, 'stimuli')

## Load Praat files

We first get all the filenames of TextGrid files within the stimulus directory. We also print out the number of files within this directory.

In [4]:
praat_fns = sorted(glob.glob(os.path.join(stim_dir, 'praat', '*.TextGrid')))

print (f'Total files in dataset: {len(praat_fns)}')

Total files in dataset: 29


<b>Note:</b> This is <b>very</b> likely not to work on the first time. Follow the steps below to get the file to load!

We are going to load a Praat TextGrid file. This will probably not work on the first time due to overlapping timestamps. To address this, do the following:
1. Open the .TextGrid file in a text editor (e.g., TextEdit, SublimeText)
2. Look at the Python error -- you will need to manually adjust these overlapping times. Copy the first number in the second parentheses:
    - <b>Example error:</b> Two intervals in the same tier overlap in time: (START_1, END_1, sp) and (START_2, END_2, B)
    - For this error, copy the number "START_2"
3. Go to the text editor, and search (cmd + F) for the copied number (e.g., "START_2").
4. Adjust the word/phoneme before's end time (e.g., END_1) to match the copied number ("START_2").
5. Save the file and rerun the code
6. Repeat for as many times until the file loads

In [5]:
# select a file number to load -- we then select that file from the list of alphabetized file names
file_num = -1
praat_fn = praat_fns[file_num]

# now grab the current filename as a path -- print out only the filename (no extension)
filepath = Path(praat_fn)
stim_name = filepath.stem
print (f'Stimulus name: {filepath.stem}')

# attempt to load the praat file -- if this doesn't work, follow the steps above 
tg = tgio.openTextgrid(praat_fns[file_num], includeEmptyIntervals=False, reportingMode="warning") 

print (f'Successfully loaded Praat file!')


Stimulus name: wheretheressmoke
Successfully loaded Praat file!


## Adjust the words to have punctuation

After loading the transcript using Praat, we concatenate all the transcript words and pass it to ChatGPT to ensure punctuation. Then we need to go through comparing word by word making sure of the following:
-  The new transcript matches the original number of words
- Words are spelled correctly (as full words)

This cell below will print out all the words of the TextGrid as a string. You will need to do the following:
1. Open ChatGPT: https://chat.openai.com/chat
2. Type the following instructions: "Add punctuation and capitalization to the following but change nothing else:"
3. Copy and paste the transcript below <i>after</i> the instructions

In [6]:
def get_textgrid_words(textgrid):
    '''
    Extracts the words in the textgrid to show in a legible format
    '''
    words = [strip_punctuation(x[-1]) for x in textgrid.getTier('word').entries]
    return words

# load the textgrid removing all enunciations
textgrid = load_clean_textgrid(praat_fn)

# gets all the words in the textgrid as an interpretable string
tg_words = get_textgrid_words(textgrid)
print (' '.join(tg_words))

I REACHED OVER AND SECRETLY UNDID MY SEATBELT AND WHEN HIS FOOT HIT THE BRAKE AT THE RED LIGHT I FLUNG OPEN THE DOOR AND I RAN I HAD NO SHOES ON I WAS CRYING I HAD NO WALLET BUT I WAS OKAY BECAUSE I HAD MY CIGARETTES AND I DIDNT WANT ANY PART OF FREEDOM IF I DIDNT HAVE MY CIGARETTES WHEN YOU LIVE WITH SOMEONE WHO HAS A TEMPER A VERY BAD TEMPER A VERY VERY BAD TEMPER YOU LEARN TO PLAY AROUND THAT YOU LEARN THIS TIME ILL PLAY POSSUM AND NEXT TIME ILL JUST BE REAL NICE OR ILL SAY YES TO EVERYTHING OR YOU MAKE YOURSELF SCARCE OR YOU RUN AND THIS WAS ONE OF THE TIMES WHEN YOU JUST RUN AND AS I WAS RUNNING I THOUGHT THIS WAS A GREAT PLACE TO JUMP OUT BECAUSE THERE WERE BIG LAWNS AND THERE WERE CULDESACS AND SOMETIMES HE WOULD COME AFTER ME AND DRIVE AND YELL STUFF AT ME TO GET BACK IN GET BACK IN AND I WAS LIKE NO IM OUT OF HERE THIS IS GREAT AND I WENT AND HID BEHIND A CABANA AND HE LEFT AND I HAD MY CIGARETTES AND UH I STARTED TO WALK IN THIS BEAUTIFUL NEIGHBORHOOD IT WAS TENTHIRTY AT NIGH

## Create a transcript file

ChatGPT will then print out a verion of the transcript with punctuation. However, we need to double-check that the words match the original transcript. After getting the transcript from ChatGPT:
1. Go to the directory '/stimuli/transcripts/' 
2. Create a text file names "STIMULUSNAME.txt" (where STIMULUSNAME is the name of the stimulus - printed out above)
3. Paste the transcript from ChatGPT into the text file

You should now be able to load the file in this notebook

In [7]:
def compare_praat_to_transcript(words_original, words_transcribed):
    '''
    Compares words from TextGrid and ChatGPT transcript word by word
    '''
    
    for i, (word_orig, word_transc) in enumerate(zip(words_original, words_transcribed)):
        if word_orig.lower() != word_transc.lower():
            print (f'Word index: {i}')
            print (f'TextGrid word: {word_orig}')
            print (f'Transcript word: {word_transc}')
            print (f'Word context: {words_original[i-5:i+5]}')
            break
    
    if i+1 == len(words_original):
        print (f'Finished transcript!')

## Check the transcript with the original file

Run the following cell to compare words from the TextGrid to words from the ChatGPT transcript.

Sometimes words will be misaligned:
- ChatGPT may have missed some words
- The Praat words may be misspelled, or hyphenated words may have been treated separately (e.g., eighty-four --> eighty four)

You will need to correct this in either 1) the transcript or 2) the Praat file and make note of the change within the tracking document

In [8]:
transcript_fn = os.path.join(stim_dir, 'transcripts', f'{stim_name}_transcript.txt')

# load the textgrid and get all words
textgrid = load_clean_textgrid(praat_fn)
words_original = get_textgrid_words(textgrid)

# load the ChatGPT created transcript
_, words_transcribed = load_transcription(transcript_fn)

compare_praat_to_transcript(words_original, words_transcribed)

Finished transcript!


## Create a gentle align file from Praat

In [11]:
gentle_stim_dir = os.path.join(stim_dir, 'gentle', stim_name)

# if the directory does not exist, make the directory
if not os.path.exists(gentle_stim_dir):
    os.makedirs(gentle_stim_dir)

Now that the directory is created, we will do the following:
- Write the aligned file to the directory
- Move a copy of the stimulus audio to the directory
- Move a copy of the transcript to the directory

In [12]:
praat_fn = praat_fns[file_num]
transcript_fn = os.path.join(stim_dir, 'transcripts', f'{stim_name}_transcript.txt')

# given the two files, creates a file in gentle aligned format
align_json = textgrid_to_gentle(praat_fn, transcript_fn)

# write the file out to the directory
with open(os.path.join(gentle_stim_dir, 'align.json'), 'w') as f:
    json.dump(align_json, f)
    
# copy the transcript file renaming it to "transcript.txt" matching gentle convention
shutil.copyfile(
    transcript_fn, 
    os.path.join(gentle_stim_dir, 'transcript.txt')
)

# copy the stimulus audio file renaming it to "a.wav" matching gentle convention
shutil.copyfile(
    os.path.join(stim_dir, 'audio', f'{stim_name}.wav'), 
    os.path.join(gentle_stim_dir, 'a.wav')
)

'/dartfs/rc/lab/F/FinnLab/tommy/isc_asynchrony_behavior/stimuli/gentle/wheretheressmoke/a.wav'

### old for testing

In [134]:
df.to_json(f'/dartfs/rc/lab/F/FinnLab/tommy/isc_asynchrony_behavior/stimuli/preprocessed/howtodraw/howtodraw_transcript-preprocessed.json', orient='records')

In [133]:
df = pd.read_csv(f'/dartfs/rc/lab/F/FinnLab/tommy/isc_asynchrony_behavior/stimuli/preprocessed/howtodraw/howtodraw_transcript-preprocessed.csv')



In [1588]:
print (len(tg_words))
print (len (words_transcribed))

2740
2742


In [1590]:
tg_words

<praatio.data_classes.interval_tier.IntervalTier at 0x2b22e9c7f880>

In [1591]:
words_transcribed

['My',
 'story',
 'is',
 'about',
 'a',
 'number',
 'of',
 'jobs',
 'that',
 'sort',
 'of',
 'shaped',
 'my',
 'life',
 'and',
 'shaped',
 'my',
 'whole',
 'destiny',
 'And',
 'it',
 'was',
 'a',
 'couple',
 'of',
 'jobs',
 'I',
 'had',
 'uh',
 'during',
 'the',
 'ages',
 'of',
 'twenty',
 'to',
 'twentyone',
 'So',
 'the',
 'story',
 'begins',
 'in',
 'in',
 'uh',
 'yes',
 'I',
 'we',
 'yes',
 'I',
 'was',
 'employed',
 'back',
 'then',
 'Ok',
 'I',
 'dont',
 'know',
 'why',
 'you',
 'laughed',
 'but',
 'Ill',
 'Ill',
 'accept',
 'that',
 'Its',
 'a',
 'good',
 'sign',
 'Ok',
 'thinking',
 'out',
 'loud',
 'here',
 'Alright',
 'so',
 'in',
 'nineteen',
 'eightyfour',
 'I',
 'was',
 'a',
 'sophomore',
 'at',
 'Princeton',
 'You',
 'can',
 'laugh',
 'at',
 'that',
 'if',
 'you',
 'like',
 'No',
 'Alright',
 'So',
 'in',
 'nineteen',
 'eightyfour',
 'I',
 'was',
 'a',
 'sophomore',
 'at',
 'Princeton',
 'I',
 'wanted',
 'to',
 'take',
 'a',
 'year',
 'off',
 'because',
 'I',
 'had',
 'I'

In [1589]:
textgrid = load_clean_textgrid(praat_fn)
tg_words = textgrid.getTier('word')

contents, words_transcribed = load_transcription(transcript_fn)

for tg_word, transcribed_word 

assert (len(tg_words) == len(words_transcribed))

# create the dictionary to store things in
# put the transcript in the raw form
align = {}
align['transcript'] = contents[0]
align['words'] = []

# Taken from Kaldi metasentence tokenizer
# splits the transcript based on any punctuation besides for apostrophes and hyphens
regex_split_pattern = r'(\w|\.\w|\:\w|\’\w|\'\w|\-\w)+'

iterator = list(re.finditer(regex_split_pattern, ''.join(contents), re.UNICODE))
n_items = len(list(iterator))
# make sure the iterator matches the length
# assert (n_items == len(tg_words) == len(words_transcribed))


## this block helps find what words are wrong
for word_info, m in zip(tg_words, iterator):
    
    if word_info[-1].lower() != m.group().lower():
        print (m)
    
sys.exit(0)

# # if all matches we're good to go
# for word_info, m in zip(tg_words, iterator):
    
#     # span of the word in characters relative to the overall string
#     start_offset, end_offset = m.span()
#     word = m.group()
    
#     # crop textgrid to the word
#     cropped_grid = textgrid.crop(cropStart=word_info[0], cropEnd=word_info[1], mode='truncated', rebaseToZero=False)
#     tg_phones = cropped_grid.getTier('phone').entries
#     word_phones = []
    
#     for phone_info in tg_phones:
#         phone = re.sub(r'\d+', '', phone_info[-1])
#         duration = phone_info[1] - phone_info[0]
#         word_phones.append({'duration': duration, 'phone': phone})
    
#     word_align = {
#         'alignedWord': word.lower(),
#         "case": "success",
#         'word': word,
#         'start': word_info[0],
#         'end': word_info[1],
#         'phones': word_phones,
#         "startOffset": start_offset,
#         "endOffset": end_offset,
#     }

#     align['words'].append(word_align)

# return align

AssertionError: 