In [None]:
import csv
import json
import re
import base64
import io
import opuslib
import soundfile as sf
from pydub import AudioSegment
from google.cloud import speech

# Single Speech-to-Text client; the test cells pass it to stream_ogg_to_speech().
speech_client = speech.SpeechClient()


In [None]:
# Frame rate handed to wrap_opus_packets_in_ogg() by the test cells.
framerate = 16000

# NOTE(review): hard-coded absolute local path; point this at your own copy of the demo TSV.
tsv_filepath = '/home/myuser/my_demo_data.tsv'  # Used for testing purposes

In [None]:

def tsv_to_json(filename):
    '''
    Load the TSV test data (provided by Sony) as a list of row dictionaries
    so that it's easier to work with.
    Used for testing only.

    The first line of the file is treated as the header. Each following row
    becomes a dict mapping header names to that row's string values; no type
    conversion is performed. Header and row are paired with ``zip``, so a row
    shorter than the header simply lacks the trailing keys and surplus cells
    of a longer row are dropped.

    Args:
        filename: Path of the tab-separated file to read.

    Returns:
        A list with one dict per data row, in file order. A file with no
        header line (an empty file) yields an empty list.
    '''
    # newline="" leaves newline handling to the csv module, as its
    # documentation requires; without it, line breaks embedded in quoted
    # fields are not interpreted correctly.
    with open(filename, "r", newline="") as f:
        reader = csv.reader(f, delimiter="\t")
        header = next(reader, None)
        if header is None:
            # Empty file: no header, hence no rows to map.
            return []
        return [dict(zip(header, row)) for row in reader]


def decode_base64_audio(base64_audio, framerate=16000, channels=1, framesize=360, decoder=None):
    '''
    Decodes one base64 encoded Opus audio packet.

    Args:
        base64_audio: Base64 representation of a single packet.
        framerate: Sample rate used to construct the Opus decoder. Only used
            when ``decoder`` is not supplied.
        channels: Channel count used to construct the Opus decoder. Only used
            when ``decoder`` is not supplied.
        framesize: Frame size argument forwarded to ``Decoder.decode``.
        decoder: Optional existing ``opuslib.Decoder`` to reuse. An Opus
            decoder keeps state between packets, so callers decoding the
            consecutive packets of one stream should create a single decoder
            and pass it for every packet. When omitted, a fresh decoder is
            created for this call (the original behaviour).

    Returns:
        The value returned by ``Decoder.decode`` for this packet.
    '''
    audio_bytes = base64.b64decode(base64_audio)
    if decoder is None:
        # Backward-compatible default: a throwaway decoder per packet.
        decoder = opuslib.Decoder(framerate, channels)
    return decoder.decode(audio_bytes, framesize)


def wrap_opus_packets_in_ogg(raw_packets, framerate, channels=1, codec=None):
    '''
    Encode audio packets into an in-memory ogg container.

    Despite the function name, the joined packets are handed to pydub's
    ``AudioSegment`` as raw sample data with ``sample_width=2``, i.e. they
    are treated as 16-bit samples rather than as Opus packets.

    Args:
        raw_packets: Iterable of ``bytes`` chunks; they are concatenated in
            order.
        framerate: Frame rate passed to ``AudioSegment``.
        channels: Channel count passed to ``AudioSegment``.
        codec: Optional codec name forwarded to ``AudioSegment.export``
            (for example ``"libopus"``). ``None`` keeps pydub's default.
            NOTE(review): with ``None`` the codec inside the ogg container
            is whatever the export backend picks, presumably not Opus.
            Confirm before sending the result with an Opus encoding setting.

    Returns:
        An ``io.BytesIO`` holding the exported audio, positioned at the start.
    '''
    joined_audio = b''.join(raw_packets)
    audio_segment = AudioSegment(joined_audio, frame_rate=framerate, sample_width=2, channels=channels)

    in_memory_obj = io.BytesIO()

    # Export audio segment as ogg
    audio_segment.export(in_memory_obj, format="ogg", codec=codec)

    # Rewind so that callers read from the first byte.
    in_memory_obj.seek(0)
    return in_memory_obj


def _iter_audio_chunks(audio, chunk_size):
    '''Yield ``audio`` as byte chunks, cutting bytes and file-like inputs into ``chunk_size`` pieces.'''
    if isinstance(audio, (bytes, bytearray)):
        # Iterating bytes directly would yield ints, not byte chunks.
        for start in range(0, len(audio), chunk_size):
            yield bytes(audio[start:start + chunk_size])
    elif hasattr(audio, "read"):
        # Iterating a file object directly yields newline-delimited "lines",
        # which for binary audio means arbitrarily sized chunks.
        while True:
            chunk = audio.read(chunk_size)
            if not chunk:
                break
            yield chunk
    else:
        # Already an iterable of byte chunks: forward it unchanged.
        yield from audio


def stream_ogg_to_speech(speech_client, packets, encoding=speech.RecognitionConfig.AudioEncoding.WEBM_OPUS, sample_rate_hertz=16000, language_code="en-US", chunk_size=4096):
    '''
    Stream ogg container to Google Speech to Text using streaming_recognize
    and print every interim and final transcription that comes back.

    Args:
        speech_client: Client object exposing ``streaming_recognize``.
        packets: The audio to send. May be a binary file-like object (read
            from its current position), a ``bytes``/``bytearray`` object, or
            an iterable of byte chunks.
        encoding: ``AudioEncoding`` value placed in the recognition config.
            NOTE(review): the default is WEBM_OPUS although this function is
            described as streaming an ogg container; the API also defines
            OGG_OPUS. Confirm which value matches the audio being sent.
        sample_rate_hertz: Sample rate placed in the recognition config.
        language_code: Language code placed in the recognition config.
        chunk_size: Maximum number of bytes per streaming request when
            ``packets`` is a bytes-like or file-like object. Ignored for an
            iterable of chunks.

    Returns:
        None. Transcriptions are printed as they arrive.

    Raises:
        ValueError: If ``chunk_size`` is not a positive number.
    '''
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of bytes")

    # Configure the OGG audio settings
    config = speech.RecognitionConfig(
        encoding=encoding,
        sample_rate_hertz=sample_rate_hertz,
        language_code=language_code,
    )

    # Create a streaming recognizer object
    streaming_config = speech.StreamingRecognitionConfig(
        config=config, interim_results=True
    )

    requests = (
        # https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#streamingrecognizerequest
        speech.StreamingRecognizeRequest(audio_content=chunk)
        for chunk in _iter_audio_chunks(packets, chunk_size)
    )

    # Start the streaming recognition
    responses = speech_client.streaming_recognize(streaming_config, requests)

    print('process streaming responses...')
    for response in responses:
        for result in response.results:
            if not result.alternatives:
                # Nothing to print for a result without alternatives.
                continue
            transcript = result.alternatives[0].transcript
            if result.is_final:
                print("Final transcription: {}".format(transcript))
            else:
                print("Interim transcription: {}".format(transcript))



In [None]:
# Load every row of the demo TSV as a dict keyed by the header columns.
audio_records = tsv_to_json(tsv_filepath)

In [None]:
# Keep just the 'Audio' column of each TSV record, parsing its JSON text
# into Python objects.
audio_records_base64_encoded = [
    json.loads(record['Audio'])
    for record in audio_records
]
print(f'Number of test Audio records in TSV: {len(audio_records_base64_encoded)}')


## Test Audio Stream 1

In [None]:
# Show the 'Transcription' column of the first record as the reference text.
print(f"Expected text output:\n{audio_records[0]['Transcription']}")

In [None]:
# Parsed 'Audio' value of the first record: the base64 packets to decode.
audio_record_row1 = audio_records_base64_encoded[0]

# Decode the packets one by one, preserving their order.
decoded_audio_packets = [decode_base64_audio(packet) for packet in audio_record_row1]

print(f'Number of Base64 Audio Packets:   {len(audio_record_row1)}')
print(f'Number of decoded audio packets:  {len(decoded_audio_packets)}')

In [None]:
# Sanity check: show the Python type of one decoded packet (fails with IndexError if the list is empty).
print(f'Data type of decoded_audio_packets: {type(decoded_audio_packets[0])}')

In [None]:
# Join the decoded packets and export them as an in-memory ogg container
ogg_container = wrap_opus_packets_in_ogg(decoded_audio_packets, framerate=framerate)

In [None]:
# Send ogg container to Google Speech to Text; interim and final transcriptions are printed
stream_ogg_to_speech(speech_client, ogg_container)


In [None]:
# Print the reference text of the first record again, right after the recognizer output.
print(f"Expected text output:\n{audio_records[0]['Transcription']}")

## Test Audio Stream 2

In [None]:
# Show the 'Transcription' column of the second record as the reference text.
print(f"Expected text output:\n{audio_records[1]['Transcription']}")

In [None]:
# Parsed 'Audio' value of the second record. Named after the row it
# actually holds instead of reusing the row-1 variable.
audio_record_row2 = audio_records_base64_encoded[1]

# Decode the packets one by one, preserving their order.
decoded_audio_packets = [decode_base64_audio(packet) for packet in audio_record_row2]

print(f'Number of Base64 Audio Packets:   {len(audio_record_row2)}')
print(f'Number of decoded audio packets:  {len(decoded_audio_packets)}')

In [None]:
# Sanity check: show the Python type of one decoded packet (fails with IndexError if the list is empty).
print(f'Data type of decoded_audio_packets: {type(decoded_audio_packets[0])}')

In [None]:
# Join the decoded packets and export them as an in-memory ogg container
ogg_container = wrap_opus_packets_in_ogg(decoded_audio_packets, framerate=framerate)

In [None]:
# Send ogg container to Google Speech to Text; interim and final transcriptions are printed
stream_ogg_to_speech(speech_client, ogg_container)

In [None]:
# Print the reference text of the second record again, right after the recognizer output.
print(f"Expected text output:\n{audio_records[1]['Transcription']}")

## Test Audio Stream 3

In [None]:
# Show the 'Transcription' column of the third record as the reference text.
print(f"Expected text output:\n{audio_records[2]['Transcription']}")

In [None]:
# Parsed 'Audio' value of the third record. Named after the row it
# actually holds instead of reusing the row-1 variable.
audio_record_row3 = audio_records_base64_encoded[2]

# Decode the packets one by one, preserving their order.
decoded_audio_packets = [decode_base64_audio(packet) for packet in audio_record_row3]

print(f'Number of Base64 Audio Packets:   {len(audio_record_row3)}')
print(f'Number of decoded audio packets:  {len(decoded_audio_packets)}')

In [None]:
# Sanity check: show the Python type of one decoded packet (fails with IndexError if the list is empty).
print(f'Data type of decoded_audio_packets: {type(decoded_audio_packets[0])}')

In [None]:
# Join the decoded packets and export them as an in-memory ogg container
ogg_container = wrap_opus_packets_in_ogg(decoded_audio_packets, framerate=framerate)

In [None]:
# Display the repr of the in-memory buffer returned by wrap_opus_packets_in_ogg().
ogg_container

In [None]:
# Send ogg container to Google Speech to Text; interim and final transcriptions are printed
stream_ogg_to_speech(speech_client, ogg_container)

In [None]:
# Print the reference text of the third record again, right after the recognizer output.
print(f"Expected text output:\n{audio_records[2]['Transcription']}")