## What this script does:

1. Extract candidate words from a dictionary file (handled with keywords.ipynb)
2. Run Google Cloud Speech-to-Text to get transcription of wav files, which is constrained by existing human(-corrected) transcriptions
3. Output a dictionary file that maps filenames onto transcriptions
4. Combine the transcription, onset, and context info to generate TextGrid files

## Setting up

In [1]:
import os
import io
import shutil
import pickle

import numpy as np
import pandas as pd

from pydub import AudioSegment
import textgrids

In [2]:
# Set the input directory: the folder whose .wav recordings are listed and transcribed below.
# This is a machine-specific absolute path; change it before running on another machine.
wav_dir = "/Users/masato/Box/cloze_experiments/filter/batch2/wav"

In [3]:
# Get the names of the experimental wav files.
# Match on the actual ".wav" extension (a substring test would also pick up
# names such as "exp_1a.wav.bak") and sort the result: os.listdir() returns
# entries in arbitrary order, and filenames[0] is later used to size the
# TextGrid template, so a stable order keeps runs reproducible.
filenames = sorted(
    f for f in os.listdir(wav_dir) if f.endswith(".wav") and "exp" in f
)

# Base names (file names without the extension)
bnames = [os.path.splitext(f)[0] for f in filenames]

## Get candidate words

In [4]:
# Get the latest file from /automatic_transcription/keywords/ directory
# NOTE: unpickling can execute arbitrary code; only load pickle files you created yourself.
# The `with` block makes sure the file handle is closed after loading.
with open("/Users/masato/Box/cloze_experiments/automatic_transcription/keywords/keywords_1110.p", "rb") as f:
    keyword_list = pickle.load(f)

## Google Cloud Speech-to-Text

You can skip this part if you are not using the Google Cloud Speech-to-Text API.

In [None]:
from google.cloud import speech

In [None]:
# Replace with the appropriate path to the API key
# Presumably this must be set before speech.SpeechClient() is created below -- confirm against the client docs.
# The path is machine-specific; do not commit the key file itself.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/masato/Documents/Research/transcription/googlecloud/autotranscription-test-e12d3589a265.json"

In [None]:
# Transcription store for later use: file base name (no extension) -> transcription text
dict_gcloud = {}

In [None]:
# Create a list to save the file names of items that the transcriber wasn't able to recognize
# (the transcription loop appends a file name when the transcript is empty or when an error occurs)
failed_list = []

Based on a snippet from:
https://cloud.google.com/speech-to-text/docs/sync-recognize

In [None]:
# Transcribe every wav file with Google Cloud Speech-to-Text.
# Results go into dict_gcloud (base name -> transcription); files that could
# not be transcribed are recorded in failed_list.
client = speech.SpeechClient()

for file in filenames:
    bname = os.path.splitext(file)[0]

    # Everything that can fail for a single file (reading the audio, looking
    # up its keywords, the API request itself) is inside the try block, so
    # one bad file or a transient API error does not abort the whole batch.
    try:
        # Load the audio file
        with open(os.path.join(wav_dir, file), "rb") as audio_file:
            content = audio_file.read()

        # Load the key words: the item id is the number in the second
        # "_"-separated field of the name, minus its trailing letter
        # (e.g. "exp_12b_xxxx" -> 12)
        item_id = int(bname.split("_")[1][:-1])
        speech_context = speech.SpeechContext(phrases=keyword_list[item_id])

        # Transcription by Google Cloud Speech-to-Text
        audio = speech.RecognitionAudio(content=content)
        config = speech.RecognitionConfig(
            language_code="en-US",
            speech_contexts=[speech_context],
        )
        response = client.recognize(config=config, audio=audio)

        # Keep the top alternative of each result and join them
        transc = " ".join(
            result.alternatives[0].transcript for result in response.results
        )
    except Exception as err:
        # Report the cause instead of silently swallowing it; unlike a bare
        # `except:`, this still lets Ctrl-C interrupt the run.
        print("Error at ", file, "-", repr(err))
        failed_list.append(file)
        transc = "FAILED "
    else:
        # If the transcription is empty
        if not transc:
            transc = "NOT_RECOGNIZED"
            failed_list.append(file)

        print(file + ":\t" + transc)

    dict_gcloud[bname] = transc

In [None]:
# Show the files for which the transcription was empty or raised an error
failed_list

In [None]:
# List (name, transcription) pairs whose transcription has fewer than two
# characters, since these might cause errors later
short = [(name, text) for name, text in dict_gcloud.items() if len(text) < 2]
print(short)

In [None]:
# Save the transcriptions for later sessions.
# Fill in the destination path before running this cell: with the empty
# placeholder, open() raises FileNotFoundError.
# The `with` block closes the file so the pickle is fully flushed to disk.
with open("", "wb") as f:
    pickle.dump(dict_gcloud, f)

# Outputting files

## Output functions

In [5]:
def generate_textgrid(dictionary, output_dir, template, onset=None, context=None, inconsistent=0, audio_dir=None):
    """Generate one Praat .TextGrid file per transcription.

    For every key ``k`` in ``dictionary`` a deep copy of ``template`` is
    filled in and written to ``<output_dir>/<k>.TextGrid``. The template
    itself is never modified.

    Parameters
    ----------
    dictionary : dict
        Maps a file base name (e.g. ``"exp_12b_abcd"``) onto its
        transcription, which is written into the second interval of the
        "words" tier.
    output_dir : str
        Directory the .TextGrid files are written to.
    template : TextGrid-like object
        Must provide a "words" tier and, when ``context`` is used, a
        "context" tier.
    onset : dict, optional
        Maps base names onto speech onset in seconds (e.g. from Chronset).
        When the onset is NaN, zero or negative, a message is printed and
        0.1 is used instead. Names missing from the dict keep the
        template's boundary.
    context : dict, optional
        Maps the item number (the integer in the second "_"-separated field
        of the name, without its trailing letter) onto context text, which
        is written into the first interval of the "context" tier.
    inconsistent : int, optional
        Set to 1 when the audio files have different durations; each
        TextGrid is then resized to the duration of ``<k>.wav``.
    audio_dir : str, optional
        Directory holding the wav files, only used when ``inconsistent``
        is 1. Defaults to the notebook-level ``wav_dir``.

    Errors for an individual file are reported with their cause and the
    remaining files are still processed.
    """
    import copy

    # Use fresh containers per call instead of mutable default arguments
    onset = {} if onset is None else onset
    context = {} if context is None else context

    for k, transcription in dictionary.items():

        try:
            # Work on a copy so that edits never leak into the template
            tg = copy.deepcopy(template)

            # If the durations of the audio files are inconsistent, edit each
            # TextGrid to match the duration of its own audio file
            if inconsistent == 1:
                source_dir = wav_dir if audio_dir is None else audio_dir
                audio = AudioSegment.from_file(os.path.join(source_dir, k + ".wav"))
                duration = audio.duration_seconds

                # The transcription interval ends 1 ms before the end of the audio
                tg["words"][1].xmax = tg["words"][-1].xmin = duration - 0.001

                if tg.xmax != duration:
                    tg.xmax = duration

                    # Stretch the last interval of every tier to the new end
                    for tier in tg.keys():
                        tg[tier][-1].xmax = duration

            tg["words"][1].text = transcription

            if k in onset:
                # If the onset is not detected or is not positive, set 0.1
                if np.isnan(onset[k]) or onset[k] <= 0:
                    print("No onset for ", k)
                    tg["words"][0].xmax = tg["words"][1].xmin = 0.1
                else:
                    tg["words"][0].xmax = tg["words"][1].xmin = onset[k]

            # e.g. "exp_12b_abcd" -> 12
            item_num = int(k.split("_")[1][:-1])

            if item_num in context:
                tg["context"][0].text = context[item_num]

            tg.write(os.path.join(output_dir, k + ".TextGrid"))

        except Exception as err:
            # Report which file failed and why, then carry on with the rest
            print("Error at ", k, "-", repr(err))

In [None]:
# Get the duration of an audio file and edit the template
# NOTE: `template` must already be loaded (see the "Load a template" cell further down) before running this cell
# The first wav file is used as the reference duration
instance = AudioSegment.from_file(os.path.join(wav_dir, filenames[0]))
# The transcription interval ends, and the last interval starts, 1 ms before the end of the audio
template["words"][1].xmax = template["words"][-1].xmin = instance.duration_seconds - 0.001

# Make sure the duration of the audio files matches the template

if template.xmax != instance.duration_seconds:
    template.xmax = instance.duration_seconds
    
    # Stretch the last interval of every tier to the end of the audio
    for k in template.keys():
        template[k][-1].xmax = instance.duration_seconds

## Setting up for outputs 

In [18]:
# You don't need these lines if you are not using Google Cloud Speech-to-Text
# Output directory for the generated .TextGrid files
output_dir_c = "/Users/masato/Box/cloze_experiments/filter/batch2/tg_raw_new"

# Reload the saved transcriptions (base name -> transcription).
# NOTE: unpickling can execute arbitrary code; only load pickle files you created yourself.
# The `with` block makes sure the file handle is closed after loading.
with open("/Users/masato/Box/cloze_experiments/filter/batch2/gcloud.p", "rb") as f:
    dict_gcloud = pickle.load(f)

## Generate .textgrid files

In [29]:
# Load a template
# generate_textgrid fills in the "words" and "context" tiers of a copy of this template
template = textgrids.TextGrid("/Users/masato/Box/cloze_experiments/automatic_transcription/temp_note.TextGrid")

In [13]:
# Display the loaded template to check its tiers and intervals
template

File type = "ooTextFile"
Object class = "TextGrid"

xmin = 0.0
xmax = 3.36
tiers? <exists>
size = 3
item []:
    item [1]:
        class = "IntervalTier"
        name = "words"
        xmin = 0.0
        xmax = 3.36
        intervals: size = 3
            intervals [1]:
                xmin = 0.0
                xmax = 0.5695189325640101
                text = ""
            intervals [2]:
                xmin = 0.5695189325640101
                xmax = 2.9325394879192213
                text = "==TRANSCRIPTION=="
            intervals [3]:
                xmin = 2.9325394879192213
                xmax = 3.36
                text = ""
    item [1]:
        class = "IntervalTier"
        name = "context"
        xmin = 0.0
        xmax = 3.36
        intervals: size = 1
            intervals [1]:
                xmin = 0.0
                xmax = 3.36
                text = ""
    item [1]:
        class = "IntervalTier"
        name = "notes"
        xmin = 0.0
        xmax = 3.36
     

In [30]:
# Fit the template to the length of the recordings, using the first wav file
# as the reference
instance = AudioSegment.from_file(os.path.join(wav_dir, filenames[0]))

# The transcription interval ends, and the final interval starts, 1 ms before
# the end of the audio
template["words"][1].xmax = instance.duration_seconds - 0.001
template["words"][-1].xmin = instance.duration_seconds - 0.001

# If the template's total length differs from the audio, update it and stretch
# the last interval of every tier to the audio duration
if template.xmax != instance.duration_seconds:
    template.xmax = instance.duration_seconds
    for k in template.keys():
        template[k][-1].xmax = instance.duration_seconds

### Get onset data from Chronset (optional)

In [22]:
# Onset store: file base name -> speech onset in seconds (filled from the Chronset output below)
onset_dict = {}

#### Read the output of Chronset

In [23]:
# Directory with Chronset output files (the .txt files in it are read in the next cell)
# This is a machine-specific absolute path; change it before running on another machine.
ch_dir = "/Users/masato/Box/cloze_experiments/filter/batch2/chronset/output"

In [24]:
# Optional: Load the output of Chronset
# Create a dictionary that maps the file names onto onset time (sec)
# Note that Chronset uses milliseconds while Praat uses seconds
#
# Only real ".txt" files are read (a substring test would also match names
# such as "out.txt.bak"), and they are read in sorted order so that, if the
# same wav appears in more than one file, the winning entry does not depend
# on the arbitrary order returned by os.listdir().
for textf in sorted(t for t in os.listdir(ch_dir) if t.endswith(".txt")):
    # Each file is read as two columns: wav file name and onset ("rt") in ms
    onset_df = pd.read_table(os.path.join(ch_dir, textf), names=["wav", "rt"])
    temp_dict = dict(zip([os.path.splitext(w)[0] for w in onset_df["wav"]], onset_df["rt"] / 1000))
    onset_dict.update(temp_dict)

In [25]:
# Load the dictionary with context information
# (generate_textgrid looks it up by item number and writes the text into the "context" tier)
# NOTE: unpickling can execute arbitrary code; only load pickle files you created yourself.
# The `with` block makes sure the file handle is closed after loading.
with open("/Users/masato/Box/cloze_experiments/automatic_transcription/data/context.p", "rb") as f:
    cont_dict = pickle.load(f)

In [26]:
# Generate textgrids with Google Cloud transcription
# Omit the `onset` argument if you don't use the onset info
# Omit the `context` argument if you don't use the context info
# set inconsistent = 1 if the durations of audio files are inconsistent

generate_textgrid(dict_gcloud, output_dir_c, template, onset = onset_dict, context = cont_dict) 


No onset for  exp_10a_fobi
No onset for  exp_11a_pqpc
No onset for  exp_11a_yafl
No onset for  exp_12b_jpej
No onset for  exp_12b_pqpc
No onset for  exp_13a_ewki
No onset for  exp_13a_jpej
No onset for  exp_13b_hspi
No onset for  exp_13b_rmht
No onset for  exp_13b_ydyq
No onset for  exp_14a_ydyq
No onset for  exp_14a_zscx
No onset for  exp_14b_jpej
No onset for  exp_14b_pqpc
No onset for  exp_15a_jbjb
No onset for  exp_15a_qpmx
No onset for  exp_15a_xiih
No onset for  exp_15b_ldan
No onset for  exp_16b_jbjb
No onset for  exp_16b_pqpc
No onset for  exp_17a_bcof
No onset for  exp_18a_zscx
No onset for  exp_18b_kxnd
No onset for  exp_1a_jbjb
No onset for  exp_1b_fobi
No onset for  exp_20b_jpej
No onset for  exp_21a_jpej
No onset for  exp_21b_hspi
No onset for  exp_22a_rvun
No onset for  exp_23a_pnuh
No onset for  exp_23a_qpmx
No onset for  exp_23b_ynpp
No onset for  exp_24b_pnuh
No onset for  exp_25b_ydyq
No onset for  exp_26a_lopr
No onset for  exp_26b_jpej
No onset for  exp_26b_mage
No 

**Recommended**  
Run common_errors.ipynb after this to correct some common issues regarding homophones/inflections