## What this script does:

1. Run Google Cloud Speech-to-Text to transcribe the wav files, using keywords from existing human(-corrected) transcriptions as recognition hints
2. Output a dictionary file that maps filenames onto transcriptions
3. Read Chronset output and create a dictionary that maps filenames onto onset data
4. Combine the transcription, onset, and context info to generate TextGrid files

## Data structure

The wav files must:
- have a single channel
- be named as {xxx}\_{item_id}{condition}\_{subject_id}.wav (e.g. exp_11a_dfsa.wav)

The keyword data must be in a dictionary where keys are the item IDs and the corresponding values are lists of keywords

The context data (optional) must be in a dictionary where keys are the item IDs and the values are strings of contexts

## Setting up

In [1]:
import os
import io
import shutil
import pickle

import numpy as np
import pandas as pd

from pydub import AudioSegment
import textgrids

In [2]:
# Set the directory with wav files
wav_dir = ""

In [3]:
# Get the names of wav files
# Only names that END in ".wav" are kept, so stray files such as "exp_1a_x.wav.bak"
# (which merely contain ".wav") are no longer picked up and sent to the transcriber
filenames = [name for name in os.listdir(wav_dir) if name.endswith(".wav") and "exp" in name]
# Base names (file names without the extension)
bnames = [os.path.splitext(name)[0] for name in filenames]

## Get candidate words

In [4]:
# Get the latest file from /automatic_transcription/keywords/ directory
# NOTE: unpickling can execute arbitrary code; only load pickle files you created yourself
# The context manager closes the file handle once the dictionary is loaded
with open("", "rb") as keyword_file:
    keyword_list = pickle.load(keyword_file)

## Google Cloud Speech-to-Text

In [5]:
from google.cloud import speech

In [6]:
# Replace with the appropriate path to the API key
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ""

In [7]:
# Create a dictionary that maps a file name onto a transcription for later use
dict_gcloud = dict()

In [8]:
# Create a list to save the file names of items that the transcriber wasn't able to recognize
failed_list = []

Based on a snippet from:
https://cloud.google.com/speech-to-text/docs/sync-recognize

In [9]:
# google cloud api
client = speech.SpeechClient()

for file in filenames:
    stem = os.path.splitext(file)[0]

    # Load the audio file
    filepath = os.path.join(wav_dir, file)
    with open(filepath, "rb") as audio_file:
        content = audio_file.read()

    # Load the key words for this item
    # (file names look like {xxx}_{item_id}{condition}_{subject_id}.wav)
    item_id = int(stem.split("_")[1][:-1])
    speech_context = speech.SpeechContext(phrases=keyword_list[item_id])

    # Transcription by Google Cloud Speech-to-Text
    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        language_code="en-US",
        speech_contexts=[speech_context],
    )

    try:
        # The API call is inside the try block so that a failed request for one
        # file is recorded as a failure instead of aborting the whole batch
        response = client.recognize(config=config, audio=audio)
        transc = " ".join(result.alternatives[0].transcript for result in response.results)
    except Exception as err:
        # Best effort: report the problem, remember the file, and move on
        print("Error at ", file, repr(err))
        failed_list.append(file)
        transc = "FAILED "
    else:
        # If the transcription is empty
        if len(transc) == 0:
            transc = "NOT_RECOGNIZED"
            failed_list.append(file)

        print(file+":\t" + transc)

    dict_gcloud[stem] = transc

exp_2a_ckjd.wav:	haunted
exp_1b_arle.wav:	followed
exp_2b_bbxk.wav:	spoken
exp_1a_ahcw.wav:	interviewed
exp_2a_bxxd.wav:	haunted
exp_1a_cbmp.wav:	interviewed
exp_1a_ewki.wav:	seen
exp_2a_fncd.wav:	haunted
exp_1a_bcof.wav:	found
exp_2b_bcof.wav:	blade
exp_1b_fncd.wav:	met
exp_2b_ewki.wav:	become
exp_2b_cbmp.wav:	seen
exp_2b_ahcw.wav:	seen
exp_1b_bxxd.wav:	liked
exp_1a_bbxk.wav:	interviewed
exp_2a_arle.wav:	haunted
exp_1b_ckjd.wav:	talked
exp_2a_cnis.wav:	visited
exp_1b_cnis.wav:	spoken


In [10]:
failed_list

[]

In [11]:
# List (file name, transcription) pairs whose transcription has fewer than
# two characters, since those might cause errors later
short = [(name, text) for name, text in dict_gcloud.items() if len(text) < 2]
print(short)

[]


In [12]:
# Save the transcription dictionary; the context manager flushes and closes the file
with open("", "wb") as gcloud_file:
    pickle.dump(dict_gcloud, gcloud_file)

# Outputting files

## Output functions

In [13]:
# This function generates .TextGrid files for Praat
# You can optionally pass a dictionary for the onset of the files
# The onset function is basically for Chronset
# You can also give it context info to help hand correction

# If the audio files have heterogeneous durations, add an argument of "inconsistent = 1"

# The function replaces the RT with 0.1 if the RT is 0 or shorter, or is NaN, producing an error message
def generate_textgrid(dictionary, output_dir, template, onset = None, context = None, inconsistent = 0, audio_dir = None):
    """Write one .TextGrid file per entry of ``dictionary``.

    Parameters:
        dictionary: maps a base file name (e.g. "exp_11a_dfsa") onto its transcription.
        output_dir: directory in which "<base name>.TextGrid" is written.
        template: TextGrid object that is deep-copied for every file; it is never
            modified. It must have a "words" tier and, when context is used, a
            "context" tier.
        onset: optional dictionary mapping base file names onto the onset in seconds.
            An onset that is NaN, zero or negative is replaced with 0.1 and reported.
        context: optional dictionary mapping the integer item ID onto a context string.
        inconsistent: if truthy, the duration of each audio file is read and the
            copied template is adjusted to it.
        audio_dir: directory with the audio files, only used when ``inconsistent`` is
            set. Defaults to the module-level ``wav_dir``.

    Returns:
        None. A file that cannot be processed is reported with "Error at" (together
        with the exception) and skipped, so the remaining files are still written.
    """
    import copy

    # None stands for "no data"; avoids shared mutable default arguments
    onset = {} if onset is None else onset
    context = {} if context is None else context

    for k, transcription in dictionary.items():

        try:
            # Load the TextGrid template and edit it
            tg = copy.deepcopy(template)

            # If the durations of the audio files are inconsistent, edit each TextGrid file
            # to match the duration of its audio file
            if inconsistent:
                source_dir = wav_dir if audio_dir is None else audio_dir
                audio = AudioSegment.from_file(os.path.join(source_dir, k + ".wav"))
                duration = audio.duration_seconds
                tg["words"][1].xmax = tg["words"][-1].xmin = duration - 0.001

                if tg.xmax != duration:
                    tg.xmax = duration

                    for tier in tg.keys():
                        # A tier without any interval has no last interval to stretch
                        if tg[tier]:
                            tg[tier][-1].xmax = duration

            tg["words"][1].text = transcription

            if k in onset:
                # If the onset is not detected or is not positive, set 0.1
                if np.isnan(onset[k]) or onset[k] <= 0:
                    print("No onset for ", k)
                    start = 0.1
                else:
                    start = onset[k]
                tg["words"][0].xmax = tg["words"][1].xmin = start

            # "exp_11a_dfsa" -> "11a" -> 11
            item_num = int(k.split("_")[1][:-1])

            if item_num in context:
                tg["context"][0].text = context[item_num]

            tg.write(os.path.join(output_dir, k + ".TextGrid"))

        except Exception as err:
            # Best effort: report the file and the reason, then continue with the next one
            print("Error at ", k, repr(err))

## Setting up for outputs 

In [14]:
# Directory in which the generated .TextGrid files are written
output_dir_c = ""
# Reload the saved transcription dictionary; the context manager closes the file handle
# NOTE: unpickling can execute arbitrary code; only load pickle files you created yourself
with open("", "rb") as gcloud_file:
    dict_gcloud = pickle.load(gcloud_file)

## Generate .textgrid files

In [15]:
# Load a template
template = textgrids.TextGrid("./temp_note.TextGrid")

In [16]:
template

File type = "ooTextFile"
Object class = "TextGrid"

xmin = 0.0
xmax = 3.36
tiers? <exists>
size = 3
item []:
    item [1]:
        class = "IntervalTier"
        name = "words"
        xmin = 0.0
        xmax = 3.36
        intervals: size = 3
            intervals [1]:
                xmin = 0.0
                xmax = 0.5695189325640101
                text = ""
            intervals [2]:
                xmin = 0.5695189325640101
                xmax = 2.9325394879192213
                text = "==TRANSCRIPTION=="
            intervals [3]:
                xmin = 2.9325394879192213
                xmax = 3.36
                text = ""
    item [1]:
        class = "IntervalTier"
        name = "context"
        xmin = 0.0
        xmax = 3.36
        intervals: size = 1
            intervals [1]:
                xmin = 0.0
                xmax = 3.36
                text = ""
    item [1]:
        class = "IntervalTier"
        name = "notes"
        xmin = 0.0
        xmax = 3.36
     

In [17]:
# Read the first audio file and use its duration to edit the template
instance = AudioSegment.from_file(os.path.join(wav_dir, filenames[0]))
clip_seconds = instance.duration_seconds

# The middle "words" interval ends, and the last one starts, 1 ms before the end of the audio
transcription_end = clip_seconds - 0.001
template["words"][1].xmax = transcription_end
template["words"][-1].xmin = transcription_end

# If the template's total duration differs from the audio, overwrite it
# and move the end of the last interval of every tier accordingly
if template.xmax != clip_seconds:
    template.xmax = clip_seconds

    for tier_name in template.keys():
        template[tier_name][-1].xmax = clip_seconds

### Get onset data from Chronset (optional)

In [18]:
onset_dict = dict()

#### Read the output of Chronset

In [19]:
# Directory with Chronset output files
ch_dir = ""

In [20]:
# Optional: Load the output of Chronset
# Fill the dictionary that maps the file names (without extension) onto onset time (sec)
# Note that Chronset uses milliseconds while Praat uses seconds
chronset_files = [t for t in os.listdir(ch_dir) if ".txt" in t]

for textf in chronset_files:
    onset_df = pd.read_table(os.path.join(ch_dir, textf), names = ["wav", "rt"])
    stems = [os.path.splitext(w)[0] for w in onset_df["wav"]]
    onset_sec = onset_df["rt"] / 1000
    onset_dict.update(zip(stems, onset_sec))

In [21]:
# Load the dictionary with context information
# NOTE: unpickling can execute arbitrary code; only load pickle files you created yourself
# The context manager closes the file handle once the dictionary is loaded
with open("", "rb") as context_file:
    cont_dict = pickle.load(context_file)

In [22]:
# Generate textgrids with Google Cloud transcription
# Omit the `onset` argument if you don't use the onset info (and `context` if you have no context info)
# set inconsistent = 1 if the durations of audio files are inconsistent

generate_textgrid(dict_gcloud, output_dir_c, template, onset = onset_dict, context = cont_dict) 


No onset for  exp_2a_ckjd


**Recommended**  
Run common_errors.ipynb after this to correct some common issues regarding homophones/inflections