# Speech To Text Examples

## Creating a Training Example
Speech is synthesized with the Google Cloud Text-to-Speech API; see the [API reference](https://cloud.google.com/text-to-speech/docs/reference/rpc/google.cloud.texttospeech.v1) for the request types used below.

In [328]:
import html
import io
import os
import random

import IPython.display as ipd
import numpy as np
from essential_generators import DocumentGenerator
from google.cloud import texttospeech
from pydub import AudioSegment

In [None]:
# Directory that holds the background-noise recordings.
BACKGROUND_DIRECTORY = "../raw_data/background_data/"
# Names of the WAV files in that directory. Matching on ".wav" (with the dot,
# case-insensitively) avoids picking up names that merely end in "wav", and
# sorting makes the list order independent of the filesystem so that seeded
# random selection is reproducible.
BACKGROUND_AUDIONAMES = sorted(
    name for name in os.listdir(BACKGROUND_DIRECTORY) if name.lower().endswith(".wav")
)
# Random text generator used to produce the filler words of each sentence.
gen = DocumentGenerator()

In [311]:
# Generates a 22-word sentence (~10s of speech) containing, on average,
# `mean_positives` occurrences of the keyword 'basically'.
def generate_sentence(mean_positives):
    """Build a random 22-word sentence with the keyword 'basically' mixed in.

    The number of keyword occurrences is drawn from a normal distribution
    with mean `mean_positives` and standard deviation 1, rounded to the
    nearest integer and clamped to the range [0, 22] so that any draw yields
    a valid sentence. The remaining words come from `gen.paragraph()`.

    Args:
        mean_positives: Average number of 'basically' occurrences wanted.

    Returns:
        A single string of exactly 22 space-separated words.
    """
    total = 22
    POSITIVE = "basically"

    # Round (rather than truncate) so the expected count matches the requested
    # mean, then clamp: a raw Gaussian draw can be negative or exceed `total`,
    # either of which would make random.sample() raise ValueError.
    drawn = int(round(random.gauss(mean_positives, 1)))
    num_positives = min(max(drawn, 0), total)
    num_negatives = total - num_positives

    # Collect filler words until there are enough, then trim the surplus.
    words = []
    while len(words) < num_negatives:
        words += gen.paragraph().split()
    words = words[:num_negatives]

    # Choose which of the `total` slots carry the keyword; every other slot
    # takes the next filler word, preserving the fillers' original order.
    positive_slots = set(random.sample(range(total), num_positives))
    fillers = iter(words)
    words = [POSITIVE if slot in positive_slots else next(fillers) for slot in range(total)]

    assert len(words) == total, "result need to be exactly 22 words (~10s)"
    return " ".join(words)

In [312]:
# Draw one sample sentence targeting a mean of 4 occurrences of 'basically'
# and print it for inspection.
sentence = generate_sentence(4)
print(sentence)

Characters (excluding (linacs) basically is that there's something sinister in laughter.. In 1897, basically large foreign-language-speaking population or. Jury in basically kirchner


In [348]:
def synthesize_ssml(ssml):
    """Synthesize speech from SSML and return it as an audio segment.

    Sends `ssml` to the Google Cloud Text-to-Speech client using the
    'en-US-Standard-C' female voice and MP3 encoding, then decodes the
    returned bytes with pydub.

    Args:
        ssml: String of SSML markup. It must be well-formed according to
            https://www.w3.org/TR/speech-synthesis/
            Example: <speak>Hello there.</speak>

    Returns:
        The AudioSegment decoded from the MP3 audio in the API response.
    """
    # Point the Google client library at the service-account key file.
    credential_path = '../credentials/basically-england.json'
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path

    client = texttospeech.TextToSpeechClient()

    input_text = texttospeech.types.SynthesisInput(ssml=ssml)

    # Note: the voice can also be specified by name.
    # Names of voices can be retrieved with client.list_voices().
    voice = texttospeech.types.VoiceSelectionParams(
        language_code='en-US',
        name='en-US-Standard-C',
        ssml_gender=texttospeech.enums.SsmlVoiceGender.FEMALE)

    audio_config = texttospeech.types.AudioConfig(
        audio_encoding=texttospeech.enums.AudioEncoding.MP3)

    response = client.synthesize_speech(input_text, voice, audio_config)

    # The response's audio_content is binary MP3 data. Decode it directly from
    # memory instead of round-tripping through a scratch file: a file written
    # inside a `with open(...)` block is not guaranteed to be flushed until the
    # block exits, so reading it back (and deleting it) while still open can
    # yield truncated audio and fails outright on Windows.
    return AudioSegment.from_file(io.BytesIO(response.audio_content), format="mp3")

In [349]:
# synthesize_ssml expects well-formed SSML, but the generated sentence is plain
# text that may contain '&', '<' or '>'. Escape those characters and wrap the
# text in a <speak> root element before sending it.
segment = synthesize_ssml("<speak>{}</speak>".format(html.escape(sentence, quote=False)))

<pydub.audio_segment.AudioSegment object at 0x111b32090>


In [342]:
# Scratch cell: reload the raw synthesized audio from temp.mp3 when such a file
# is lying around. synthesize_ssml does not leave temp.mp3 behind, so on a
# fresh Restart & Run All the file is normally absent; in that case keep the
# `segment` produced by the previous cell instead of raising.
if os.path.exists("temp.mp3"):
    segment = AudioSegment.from_mp3("temp.mp3")

In [314]:
def process_audio(segment):
    """Normalise a clip to 10000 ms, one channel, and a 123000 frame rate.

    Args:
        segment: Audio segment to normalise.

    Returns:
        The segment cut to at most 10000 ms, laid over 10000 ms of silence,
        then converted to 1 channel and a frame rate of 123000.
    """
    # Keep at most the first 10000 ms of the input.
    clipped = segment[:10000]
    # Laying the clip over a 10000 ms silent canvas pads shorter clips.
    canvas = AudioSegment.silent(duration=10000)
    fixed_length = canvas.overlay(clipped)
    # Mono first, then resample, in the same order as before.
    return fixed_length.set_channels(1).set_frame_rate(123000)

In [350]:
# Normalise the clip's length, channel count and frame rate with process_audio.
segment = process_audio(segment)

In [308]:
def random_background(background_audionames, debug=False):
    """
    Pick one background recording at random and load it.

    Args:
        background_audionames: Non-empty sequence of WAV file names located
            in BACKGROUND_DIRECTORY.
        debug: When true, print the name of the selected file.

    Returns:
        The selected recording loaded with AudioSegment.from_wav.

    Raises:
        ValueError: If `background_audionames` is empty.
    """
    # Fail with a clear message instead of numpy's generic one.
    if len(background_audionames) == 0:
        raise ValueError("background_audionames is empty; no background audio to choose from")

    # Same numpy draw as before so seeded runs select the same file; str()
    # turns numpy's string scalar into a plain Python string.
    random_audio = str(np.random.choice(background_audionames, 1, replace=False)[0])
    if debug:
        print("Selecting background file randomly:\n    - {}".format(random_audio))

    # BACKGROUND_DIRECTORY is only read, so no `global` declaration is needed.
    # os.path.join copes with the directory lacking a trailing separator.
    link = os.path.join(BACKGROUND_DIRECTORY, random_audio)
    return AudioSegment.from_wav(link)

def overlay_background(segment, debug=False):
    """Lay `segment` on top of a randomly chosen background recording.

    Args:
        segment: Foreground audio to place over the background.
        debug: Forwarded to random_background.

    Returns:
        The result of overlaying `segment` at position 0 onto the background
        after 30 has been subtracted from it (pydub treats subtracting a
        number from a segment as a gain reduction in dB).
    """
    quiet_background = random_background(BACKGROUND_AUDIONAMES, debug) - 30
    mixed = quiet_background.overlay(segment, position=0)
    return mixed

In [351]:
# Mix the clip over a randomly selected background recording.
segment = overlay_background(segment)

In [355]:
# Display the clip's duration in seconds.
segment.duration_seconds

10.0

In [330]:
def create_training_example(mean_positives, output_filepath):
    """Generate one synthetic 10-second training clip and save it as WAV.

    Steps: generate a random sentence, synthesize it to speech, normalise the
    audio with process_audio, check the length, and export the result.

    Args:
        mean_positives: Average number of 'basically' occurrences to request
            from generate_sentence.
        output_filepath: Destination path of the exported WAV file.

    Raises:
        AssertionError: If the processed clip is not exactly 10 seconds long.
    """
    sentence = generate_sentence(mean_positives)
    # synthesize_ssml requires well-formed SSML: escape markup characters in
    # the generated text and wrap it in a <speak> root element.
    ssml = "<speak>{}</speak>".format(html.escape(sentence, quote=False))
    segment = synthesize_ssml(ssml)
    segment = process_audio(segment)
    # Add background for the next step
    # segment = overlay_background(segment)

    # len() of a pydub AudioSegment is its duration in milliseconds, so a
    # 10-second clip has length 10000 (comparing against 10.0 never matched).
    assert len(segment) == 10000, "segment needs to be exactly 10s long."

    # Export as wav
    segment.export(output_filepath, format='wav')

In [356]:
# End-to-end run: write one training example to disk, then play it back from
# the same path.
output_filepath = "output.wav"
create_training_example(mean_positives=4, output_filepath=output_filepath)
ipd.Audio(output_filepath)

<pydub.audio_segment.AudioSegment object at 0x11782d110>


In [67]:
# Load the three MP3 clips in playback order.
pre_1, basically, pre_2 = (
    AudioSegment.from_mp3(clip_path)
    for clip_path in ("pre_1.mp3", "basically.mp3", "pre_2.mp3")
)

# Join them end to end with the keyword clip in the middle.
output = pre_1 + basically + pre_2

# Write the joined audio out as MP3.
output.export("output.mp3", format="mp3")

<_io.BufferedRandom name='output.mp3'>

In [68]:
# Play back the exported output.mp3 inline.
ipd.Audio('output.mp3')