In [505]:
import math
import numpy as np
import re
from IPython.display import Audio, display
from scipy.io import wavfile

In [774]:
import io
import os

# Imports the Google Cloud client library
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types

# Sets authentication environment variable.
# The key-file path is relative, so it resolves against the kernel's current
# working directory.
# NOTE(review): presumably the Google client library reads this variable when
# the client below is constructed -- keep this assignment ahead of it.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'auth/Chemistry-NU.json'

# Instantiates a single module-level client; gcs() reuses it for every request.
client = speech.SpeechClient()

def gcs(signal, fs):
    """Send one audio clip to Google Cloud Speech and return the raw response.

    Args:
        signal: Array of audio samples. Its raw bytes (``signal.tobytes()``)
            are uploaded unchanged and declared as LINEAR16, so it is assumed
            to already hold 16-bit PCM samples -- TODO confirm for new data.
        fs: Sample rate of ``signal`` in hertz.

    Returns:
        Whatever ``client.recognize`` returns for this clip.
    """
    recognition_config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=fs,
        language_code='en-US',
        max_alternatives=1,
        model='video',
        # Options left disabled in this experiment:
        #   enable_automatic_punctuation=True,
        #   audio_channel_count=6,
        #   enable_separate_recognition_per_channel=True,
    )
    recognition_audio = types.RecognitionAudio(content=signal.tobytes())

    # One recognize request covering the whole clip.
    return client.recognize(recognition_config, recognition_audio)

Word Error Rate calculation based on jiwer: https://github.com/jitsi/asr-wer

In [746]:
def wer(human_transcript, asr_transcript):
    """Compute the word error rate of an ASR transcript against a reference.

    Args:
        human_transcript: Sequence of reference tokens (must not be empty).
        asr_transcript: Sequence of hypothesis tokens from the recogniser.

    Returns:
        The token-level edit distance between the two sequences divided by
        the number of reference tokens.

    Raises:
        ValueError: If ``human_transcript`` is empty (raised by
            ``_edit_distance``).
    """
    # The alignment only ever tests tokens for equality, so the tokens are
    # passed through as they are. Building a vocabulary list and looking up
    # each word with ``in`` / ``.index`` was quadratic in the number of
    # distinct words and did not change the resulting distance.
    distance = _edit_distance(human_transcript, asr_transcript)

    # Normalise the number of edits by the length of the reference.
    return distance / len(human_transcript)

def _edit_distance(a, b):
    # Calculate edit distance based on Wagner-Fischer algorithm
    if len(a) == 0:
        raise ValueError('Reference string cannot be empty.')
    elif len(b) == 0:
        return len(a)

    # Initialize matrix and set the first row and column equal to 1, 2, 3, ...
    # Each column represents a single token in the reference string a
    # Each row represents a single token in the reference string b
    
    m = np.zeros((len(b) + 1, len(a) + 1), dtype=np.int32)

    m[0, 1:] = np.arange(1, len(a) + 1)
    m[1:, 0] = np.arange(1, len(b) + 1)

    # Loop over remaining cells (from second row and column onwards)
    # The value of each selected cell is:
    #   if token represented by row == token represented by column:
    #       value of the top-left diagonal cell
    #   else:
    #       calculate 3 values:
    #            * top-left diagonal cell + 1 (which represents substitution)
    #            * left cell + 1 (representing deleting)
    #            * top cell + 1 (representing insertion)
    #       value of the smallest of the three
    
    for i in range(1, m.shape[0]):
        for j in range(1, m.shape[1]):
            if a[j-1] == b[i-1]:
                m[i, j] = m[i-1, j-1]
            else:
                m[i, j] = min(
                    m[i-1, j-1] + 1,
                    m[i, j-1] + 1,
                    m[i-1, j] + 1
                )

    # The minimum-edit distance is the value of the bottom-right cell of matrix
    return m[len(b), len(a)]

In [754]:
# Load the session recording; fs is the sample rate in Hz.
fs, recording = wavfile.read('audio/Sony In lab Audio/32.wav')

# wavfile.read returns a 1-D array for mono files and an
# (n_samples, n_channels) array otherwise. Only index the channel axis when it
# exists, so a mono file does not raise IndexError.
if recording.ndim > 1:
    recording = recording[:, 0]  # Grab 1st channel

# recording = recording[:,:6] # Grab 6 channels
# combined_channels = np.zeros(len(recording), dtype=np.int16)
# for col in range(recording.shape[1]):
#     combined_channels += recording[:,col]
# combined_channels //= 6
# recording = combined_channels

In [531]:
# Parse the human transcript into parallel lists: one timestamp string and one
# cleaned utterance per transcript line that contains a ':' after its
# bracketed parts are removed.
timestamps = []
utterances = []

with open('audio/Sony In lab Audio/32.txt', 'r') as f:
    for line in f:
        # Assumes each line starts with a bracketed 8-character timestamp such
        # as "[00:01:23]" -- TODO confirm against the transcript format.
        timestamp = line[1:9]

        # Raw string: '\[' in a normal string literal is an invalid escape
        # sequence and triggers a warning on recent Python versions.
        line = re.sub(r'\[.*?\]', ' ', line)  # remove timestamps and inaudible

        # Keep only the text from the first ':' onwards; lines without one
        # are skipped.
        colon = line.find(':')
        if colon == -1:
            continue
        line = line[colon:]

        line = line.replace('...', ' ')  # replace ellipses with spaces
        line = line.translate(str.maketrans('', '', '!"#&()*+,./:;<=>?@[\\]^_`{|}~'))  # remove punctuation
        line = re.sub(' +', ' ', line)  # collapse runs of spaces
        line = line.strip('\n ')  # remove newline character and leading/trailing spaces

        timestamps.append(timestamp)
        utterances.append(line)

In [532]:
# Convert each 'HH:MM:SS' timestamp into seconds from the start of the
# recording. The hours field (characters 0-1) is included so that timestamps
# past 59:59 do not wrap back to the beginning of the audio.
seconds = [
    int(stamp[0:2]) * 3600 + int(stamp[3:5]) * 60 + int(stamp[6:8])
    for stamp in timestamps
]

In [536]:
# Sample index at which each utterance starts (seconds times the sample rate).
frames = [fs * start_second for start_second in seconds]

In [565]:
# Cut the recording into one clip per utterance (except the last one): from
# the utterance's own start to the next later start, plus trailing padding.
padding = fs  # one second of samples at the file's actual sample rate

clips = []
for t in range(len(frames) - 1):
    # Timestamps have one-second resolution, so the following utterance(s) may
    # share this start. Skip forward to the first later start, but never past
    # the final timestamp (the original frames[t + 2] lookup could run off the
    # end of the list).
    end = t + 1
    while end < len(frames) - 1 and frames[end] == frames[t]:
        end += 1
    clips.append(recording[frames[t]:frames[end] + padding])

In [541]:
# Clips longer than one minute are split into equal segments before being sent
# for recognition. The limit is derived from the real sample rate instead of
# assuming 16 kHz (960000 samples).
max_samples = 60 * fs

responses = []

for clip_num, clip in enumerate(clips):
    if len(clip) > max_samples:
        print(f'Clip {clip_num} longer than 1 minute, split into segments')
        n_segments = math.ceil(len(clip) / max_samples)
        response = None
        for segment in np.array_split(clip, n_segments):
            segment_response = gcs(segment, fs)
            if response is None:
                # The first segment's response becomes the merge target.
                response = segment_response
            else:
                # Append this segment's results to the combined response.
                response.MergeFrom(segment_response)
    else:
        print(f'Getting response for clip {clip_num}')
        response = gcs(clip, fs)
    responses.append(response)

Getting response for clip 0
Getting response for clip 1
Getting response for clip 2
Getting response for clip 3
Getting response for clip 4
Getting response for clip 5
Getting response for clip 6
Getting response for clip 7
Getting response for clip 8
Getting response for clip 9
Getting response for clip 10
Getting response for clip 11
Getting response for clip 12
Getting response for clip 13
Getting response for clip 14
Getting response for clip 15
Getting response for clip 16
Getting response for clip 17
Getting response for clip 18
Getting response for clip 19
Getting response for clip 20
Getting response for clip 21
Getting response for clip 22
Getting response for clip 23
Getting response for clip 24
Getting response for clip 25
Getting response for clip 26
Getting response for clip 27
Getting response for clip 28
Getting response for clip 29
Getting response for clip 30
Getting response for clip 31
Getting response for clip 32
Getting response for clip 33
Getting response for cli

In [580]:
# Flatten each response into a single string: the text of every alternative of
# every result, each one prefixed with a space.
transcripts = [
    ''.join(
        ' ' + alternative.transcript
        for result in response.results
        for alternative in result.alternatives
    )
    for response in responses
]

In [598]:
# Collapse runs of spaces to a single space, then trim both ends.
transcripts = [re.sub(' +', ' ', text).strip() for text in transcripts]

In [738]:
# Lower-case both sides so the comparison is case-insensitive.
# The last human utterance is omitted -- presumably because clips are cut
# between consecutive timestamps, leaving the final utterance without a clip;
# confirm against the clip-building cell.
human_transcripts = [text.lower() for text in utterances[:-1]]
google_transcripts = [text.lower() for text in transcripts]

In [791]:
# Split every transcript on whitespace into a list of word tokens.
human_tokens = list(map(str.split, human_transcripts))
google_tokens = list(map(str.split, google_transcripts))

In [805]:
# Calculate WERs and normalize based on word count: the overall figure is the
# total number of word errors divided by the total number of reference words.
total_wc = 0
total_errors = 0

for ref_tokens, hyp_tokens in zip(human_tokens, google_tokens):
    total_wc += len(ref_tokens)
    if ref_tokens:
        # wer() is errors / reference length, so scaling by the reference
        # length recovers this utterance's error count.
        total_errors += wer(ref_tokens, hyp_tokens) * len(ref_tokens)
    else:
        # wer() rejects an empty reference. With nothing to match against,
        # every recognised word counts as an insertion error.
        total_errors += len(hyp_tokens)

if total_wc == 0:
    raise ValueError('No reference words found; overall WER is undefined.')

overall_wer = total_errors / total_wc

In [806]:
overall_wer

0.4813953488372093