## Speech_to_Text notebook to try out models and build a class that eventually automates testing.

In [5]:
import json
import matplotlib.pyplot as plt
from natsort import natsorted
import numpy as np
import os
import scipy
import seaborn as sns
from tqdm.notebook import tqdm
from vosk import Model, KaldiRecognizer


from importlib import reload
from scipy.io import wavfile
from IPython.display import Audio

### Load paths of recordings to pass to speech_to_text class and create dict with ground truth

In [3]:
# Naturally sorted file names (names only, not full paths) of the recordings folder.
paths = list(natsorted(os.listdir("H:/Speech_to_Text/7")))

In [27]:
# Ground-truth sentences, keyed 1..10 in the order listed below
# (partially generated with the General Purpose Service Robot Command Generator).
# NOTE(review): sentence 8 starts with a space and sentences 8 and 9 contain commas;
# confirm this is intended before comparing against transcriptions.
tests = dict(enumerate(
    [
        "pick up the bag",
        "open the door",
        "touch the desk",
        "could you please hand the coke to lisa",
        "put the spoon next to the bowl",
        "place the tab inside the dishwasher",
        "get the soap from the bar navigate to the sink and bring it to jacob",
        " tell the time find the waving person in the living room, and escort him to the dining table",
        "could you locate alexander in the bed guide him to the apartment, and follow emma who is at the tv stand",
        "could you please take the left-most object from the bookcase to the shelf",
    ],
    start=1,
))

### Init class speech_to_text 
Once finished, this class handles initializing and running the different models, comparing their output to the ground truth, and calculating metrics.

In [None]:
class speech_to_text():
    """Run several speech-to-text engines over one folder of recordings.

    Typical use: construct the object, call ``load_data`` once, then for each
    engine call its ``init_*`` method followed by its ``run_*`` method.
    Every ``run_*`` method stores its transcriptions in
    ``self.detected_words[<engine name>]`` and also returns them.

    Attributes:
        models: Value passed to the constructor, kept unchanged. Engine
            handles live in the dedicated attributes below.
        ground_truth: Reference transcriptions passed to the constructor
            (presumably a dict of sentences; verify against the caller).
        detected_words: ``{engine name: {file index: transcription}}``.
        confidences: ``{engine name: {file index: [confidence, ...]}}``
            (only filled by ``run_vosk``).
        file_directory / file_paths / fs: Set by ``load_data``.
        vosk_model, whisper_model, google_client, google_model: Engine
            handles set by the matching ``init_*`` method (``None`` before).
    """

    def __init__(self, models, ground_truth):
        """Store the configuration; no model is loaded here."""
        self.models = models
        self.ground_truth = ground_truth
        self.detected_words = {}
        self.confidences = {}

        # Recording set; filled by load_data().
        self.file_directory = None
        self.file_paths = []
        self.fs = 44100

        # Engine handles; filled by the init_* methods.
        self.vosk_model = None
        self.whisper_model = None
        self.google_client = None
        self.google_model = None

    def load_data(self, file_directory, fs = 44100):
        """Register the recordings found in ``file_directory``.

        Args:
            file_directory: Folder containing the audio files.
            fs: Sampling rate in Hz, used for the Google request config.
        """
        self.file_directory = file_directory
        # File names only; _file_path() joins them with the directory.
        self.file_paths = natsorted(os.listdir(file_directory))
        self.fs = fs

    def _require_data(self):
        """Fail early if ``load_data`` has not been called yet."""
        assert self.file_directory is not None, "First load recordings using 'load_data'"

    def _file_path(self, file):
        """Return the full path of a file name listed in ``self.file_paths``."""
        return os.path.join(self.file_directory, file)

    @staticmethod
    def _collect_vosk_entries(result_json, thresh, words, confs):
        """Append the words of one Vosk result whose confidence exceeds ``thresh``.

        A result without recognized words has no "result" key; it is
        treated as an empty word list instead of raising ``KeyError``.
        """
        for entry in json.loads(result_json).get("result", []):
            if entry["conf"] > thresh:
                confs.append(entry["conf"])
                words.append(entry["word"])


    #VOSK block
    def init_vosk(self, model_path = "vosk-model-en-us-0.42-gigaspeech"):
        """Load the Vosk model stored in the folder ``model_path``.

        The recognizer itself is created per file in ``run_vosk``, because
        it needs the sampling rate of the audio it is fed.
        """
        assert os.path.exists(model_path), "model not in current path"
        self.vosk_model = Model(model_path)

    def run_vosk(self, n_frames = 4000, thresh = 0):
        """Transcribe every loaded WAV file with Vosk.

        Args:
            n_frames: Size, in bytes, of each chunk fed to the recognizer.
            thresh: Only words with a confidence strictly above this value
                are kept.

        Returns:
            ``(words, confs)``: two dicts mapping the file index to the list
            of kept words and to the list of their confidences.
        """
        assert self.vosk_model is not None, "First initialize vosk model using 'init_vosk'"
        self._require_data()

        # Two independent dicts: a shallow copy would share the inner lists
        # and mix words and confidences in the same list.
        words = {k: [] for k in range(len(self.file_paths))}
        confs = {k: [] for k in range(len(self.file_paths))}

        for ix, file in tqdm(enumerate(self.file_paths), total = len(self.file_paths)):
            rate, audio = wavfile.read(self._file_path(file))
            # assumes 16-bit mono PCM samples -- TODO confirm for all recordings
            audio_bytes = bytes(bytearray(audio))

            # Fresh recognizer per file, using the rate from the WAV header.
            rec = KaldiRecognizer(self.vosk_model, rate)
            rec.SetWords(True)

            #Run Speech Recognizer chunk by chunk
            for start in range(0, len(audio_bytes), n_frames):
                if rec.AcceptWaveform(audio_bytes[start:start + n_frames]):
                    self._collect_vosk_entries(rec.Result(), thresh, words[ix], confs[ix])

            #last detected words are not in rec.Result(), therefore...
            self._collect_vosk_entries(rec.FinalResult(), thresh, words[ix], confs[ix])

        self.detected_words["vosk"] = words
        self.confidences["vosk"] = confs
        return words, confs


    #Whisper block
    def init_whisper(self, mode = "base"):
        """
        Load a Whisper model and keep it on the instance.

        mode: "tiny", "base", "small", "medium", "large"
        """
        import whisper  # optional dependency, only needed for this engine
        self.whisper_model = whisper.load_model(mode)

    def run_whisper(self):
        """
        Transcribe every loaded file with Whisper.

        Returns a dict mapping the file index to the transcribed text.

        Note: Internally, the transcribe() method reads the entire file and processes the audio
            with a sliding 30-second window, performing autoregressive sequence-to-sequence predictions
            on each window.
        """
        assert self.whisper_model is not None, "First initialize whisper model using 'init_whisper'"
        self._require_data()
        import whisper  # optional dependency, only needed for this engine

        words = {}
        for ix, file in tqdm(enumerate(self.file_paths), total = len(self.file_paths)):
            audio = whisper.load_audio(self._file_path(file))
            audio = whisper.pad_or_trim(audio) #pad/trim to fit 30 seconds
            words[ix] = self.whisper_model.transcribe(audio)["text"]

        self.detected_words["whisper"] = words
        return words


    #Google Cloud Speech API block
    def init_google(self, model = "default", key = "google_key.json"):
        """
        Create the Google Cloud Speech client and remember the model name.

        model: "default", "video", "command_and_search"
        key: path to Google Service key [json]
        Note: Cloud Speech-to-Text API needs to be enabled in your account!
        """
        assert os.path.exists(key), "path to key does not exist"
        from google.cloud import speech  # optional dependency, only needed for this engine
        self.google_client = speech.SpeechClient.from_service_account_file(key)
        self.google_model = model

    def run_google(self):
        """Transcribe every loaded file with the Google Cloud Speech API.

        Returns a dict mapping the file index to the transcribed text. The
        text is the best alternative of every returned result joined by
        spaces, or an empty string when nothing was recognized.
        """
        assert self.google_client is not None, "First initialize google client using 'init_google'"
        self._require_data()
        from google.cloud import speech  # optional dependency, only needed for this engine

        #Note: for non-wav (or non-flac) files, an encoding parameter must be passed to config
        # Built here so the sampling rate given to load_data() is always used.
        config = speech.RecognitionConfig(
            sample_rate_hertz = self.fs,
            language_code = "en-US",
            model = self.google_model,
        )

        words = {}
        for ix, file in tqdm(enumerate(self.file_paths), total = len(self.file_paths)):
            with open(self._file_path(file), "rb") as f:
                content = f.read()
            response = self.google_client.recognize(
                config = config,
                audio = speech.RecognitionAudio(content = content)
            )
            # A response may hold zero or several results; keep them all.
            words[ix] = " ".join(
                result.alternatives[0].transcript.strip()
                for result in response.results
                if result.alternatives
            )

        self.detected_words["google"] = words
        return words

### Model Playground (before putting them into the class)

In [15]:
import wave

# Open the recording read-only inside a context manager so the file handle is
# released again; the header parameters stay available in `wav_params`.
with wave.open("H:/Speech_to_Text/7/2_7.wav", "rb") as obj:
    wav_params = obj.getparams()

In [19]:
#!pip install vosk
import vosk

In [20]:
# Load one sample recording; wavfile.read returns the sampling rate and the samples.
# NOTE(review): absolute, user-specific path -- adjust before running on another machine.
fs, audio = wavfile.read("C:/Users/Daydreamore/Desktop/Semester/speech_recognition/recordings/7/2_7.wav")

## Vosk (based on Kaldi toolbox)
Note: there are 2 different models!

In [22]:
# The model folder is looked up relative to the working directory,
# so switch into the project folder first.
os.chdir("C:/Users/Daydreamore/Desktop/pythonProject/Speech_to_Text")

# Folder name of the Vosk model (alternative: "GigaSpeech_ASR_XL").
model_dir = "vosk-model-en-us-0.42-gigaspeech"
assert os.path.exists(model_dir), "model not in current path"

# Load the model; `model` is the Model object used by the recognizer cells below.
model = Model(model_dir)

In [None]:
# Sampling rate handed to the recognizer; presumably it has to match the recordings -- TODO confirm.
sample_rate = 44100
# NOTE(review): the following cell creates `rec` again and additionally calls
# rec.SetWords(True), so this cell looks redundant.
rec = KaldiRecognizer(model, sample_rate)

In [23]:
#Initialize KaldiRecognizer
sample_rate = 44100
audio_bytes = bytes(bytearray(audio))
n_frames = 4000 #chunk size in bytes fed to the recognizer per step
thresh = 0 #only words with a confidence above this value are kept
conf = [] #store confidence of detected word
word = [] #store words

rec = KaldiRecognizer(model, sample_rate)
rec.SetWords(True)


def _keep_confident_words(result_json):
    """Append the words (and confidences) of one Vosk result that exceed `thresh`.

    A result without recognized words has no "result" key; it is treated as
    an empty word list, so trailing silence no longer raises KeyError.
    """
    for entry in json.loads(result_json).get("result", []):
        if entry["conf"] > thresh:
            conf.append(entry["conf"])
            word.append(entry["word"])


#Progress bar over the samples; the context manager closes it when done
with tqdm(total=len(audio)) as pbar:
    #Run Speech Recognizer chunk by chunk
    for start in range(0, len(audio_bytes), n_frames):
        data = audio_bytes[start:start + n_frames]
        pbar.update(len(data)/2) #two bytes per sample

        if rec.AcceptWaveform(data):
            _keep_confident_words(rec.Result())

    #last detected word is not in rec.Result(), therefore...
    _keep_confident_words(rec.FinalResult())

  0%|          | 0/209955 [00:00<?, ?it/s]

In [24]:
word

['get',
 'the',
 'soap',
 'from',
 'the',
 'bar',
 'navigate',
 'to',
 'the',
 'sink',
 'and',
 'bring',
 'it',
 'to',
 'jacob']

## Whisper (Open AI)

In [6]:
#pip install git+https://github.com/openai/whisper.git

In [4]:
import whisper

In [None]:
# Load the "base" Whisper model.
# NOTE(review): this rebinds `model` and `audio`, which the Vosk cells above also use.
model = whisper.load_model("base")

# Decode the recording and pad/trim it to fit 30 seconds in a single step.
audio = whisper.pad_or_trim(
    whisper.load_audio("C:/Users/Daydreamore/Desktop/Semester/speech_recognition/recordings/7/2_7.wav")
)

# The transcription result is a mapping; its "text" entry is shown in the next cell.
result = model.transcribe(audio)

In [39]:
result["text"]

' Get the soap from the bar navigate to the sink and bring it to Jacob'

## Google Cloud Speech to Text API ($)

Audio data can either be loaded from Google Cloud Storage or sent directly in the request, provided the audio is shorter than 1 minute and smaller than 10 MB.

In [34]:
#pip install --upgrade google-cloud-speech
#Documentation: https://cloud.google.com/speech-to-text/docs/speech-to-text-requests
from google.cloud import speech
# Authenticate with the service-account key file.
client = speech.SpeechClient.from_service_account_file("google_key.json")

file_path = "C:/Users/Daydreamore/Desktop/Semester/speech_recognition/recordings/7/7_7.wav"

# Send the raw file content directly in the request.
with open(file_path, "rb") as f:
    audio_data = f.read()

audio_file = speech.RecognitionAudio(content = audio_data)

#Note: for non-wav (or flac) files, an encoding parameter must be passed to config
config = speech.RecognitionConfig(
    sample_rate_hertz = 44100,
    language_code = "en-US",
    #model = default
)

response = client.recognize(
    config = config,
    audio = audio_file,
)

# A response may contain zero or several results: join the best alternative of
# each instead of indexing results[0], which fails when nothing was recognized
# and drops everything after the first result.
transcript = " ".join(
    result.alternatives[0].transcript.strip()
    for result in response.results
    if result.alternatives
)
print(transcript if transcript else "<no speech recognized>")

get the soap from the bar navigate to the sink and bring it to Jacob
