## Speech_to_Text notebook that displays short examples of the different models before defining the class that is used to initialize the models and run the comparison.

### Load libraries

In [1]:
#Import general libraries for sorting, computing, interacting with OS
from natsort import natsorted
import numpy as np
import os

#Import the speech-to-text models
from google_api import init_google, run_google
from vosk_api import init_vosk, run_vosk
from wav2vec2_api import init_wav2vec2, run_wav2vec2
from whisper_api import init_whisper, run_whisper

#Import metrics
from metrics import rtf, wer

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
#Quick check of the alternative wav2vec2 checkpoint on a single recording
#NOTE(review): absolute, machine-specific path - adjust before running on another machine
file_path = "C:/Users/Daydreamore/Desktop/Semester/speech_recognition/recordings/7/2_7.wav"
setup = init_wav2vec2(model_name = "facebook/wav2vec2-large-robust-ft-libri-960h")
#run_wav2vec2 returns two values: the model output and, presumably, the processing time
#(the comparison class feeds the second value to rtf()) - confirm in wav2vec2_api
#NOTE(review): the name `time` would shadow the stdlib module if that is ever imported here
out, time = run_wav2vec2(file_path, setup)

Downloading: 100%|██████████| 212/212 [00:00<00:00, 30.4kB/s]
Downloading: 100%|██████████| 292/292 [00:00<00:00, 73.0kB/s]
Downloading: 100%|██████████| 181/181 [00:00<00:00, 30.2kB/s]
Downloading: 100%|██████████| 85.0/85.0 [00:00<00:00, 41.6kB/s]
Downloading: 100%|██████████| 1.54k/1.54k [00:00<00:00, 522kB/s]
Downloading: 100%|██████████| 1.18G/1.18G [01:39<00:00, 12.7MB/s]


### Short code snippets that allow running each model separately:

In [6]:
#Standalone snippet: runs the Google model with its default setup on one recording
#(the import is repeated so this cell can be run on its own)
from google_api import init_google, run_google
#NOTE(review): absolute, machine-specific path - adjust before running on another machine
file_path = "C:/Users/Daydreamore/Desktop/Semester/speech_recognition/recordings/7/2_7.wav"
setup = init_google()
#Two return values: the model output and, presumably, the processing time - confirm in google_api
out, time = run_google(file_path, setup)

In [None]:
#Standalone snippet: runs the Whisper model with its default setup on one recording
#(the import is repeated so this cell can be run on its own)
from whisper_api import init_whisper, run_whisper
#NOTE(review): absolute, machine-specific path - adjust before running on another machine
file_path = "C:/Users/Daydreamore/Desktop/Semester/speech_recognition/recordings/7/2_7.wav"
setup = init_whisper()
#Two return values: the model output and, presumably, the processing time - confirm in whisper_api
out, time = run_whisper(file_path, setup)

In [1]:
#Standalone snippet: runs the Vosk model with its default setup on one recording
#(the import is repeated so this cell can be run on its own)
from vosk_api import init_vosk, run_vosk
#NOTE(review): absolute, machine-specific path - adjust before running on another machine
file_path = "C:/Users/Daydreamore/Desktop/Semester/speech_recognition/recordings/7/2_7.wav"
setup = init_vosk()
#Two return values: the model output and, presumably, the processing time - confirm in vosk_api
out, time = run_vosk(file_path, setup)

In [1]:
#Standalone snippet: runs the wav2vec2 model with its default setup on one recording
#(the import is repeated so this cell can be run on its own)
from wav2vec2_api import init_wav2vec2, run_wav2vec2
#NOTE(review): absolute, machine-specific path - adjust before running on another machine
file_path = "C:/Users/Daydreamore/Desktop/Semester/speech_recognition/recordings/7/2_7.wav"
setup = init_wav2vec2()
#Two return values: the transcribed text and a float, presumably the processing time in
#seconds (see the tuple shown in this cell's output) - confirm in wav2vec2_api
out, time = run_wav2vec2(file_path, setup)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  tensor = as_tensor(value)


('get the soap from the bar navigate to the sink and bring it to jacob',
 1.696542501449585)

## Comparison Code:

### Ground Truth for the recorded sentences

In [2]:
#test sentences (partially generated with the General Purpose Service Robot Command Generator)
#Ground truth is kept lowercase and free of punctuation for every sentence: a token such as
#"room," would otherwise differ from a transcribed "room" and be counted as a word error
#unless wer() strips punctuation itself.
tests = {1: "pick up the bag",
         2: "open the door",
         3: "touch the desk",
         4: "could you please hand the coke to lisa",
         5: "put the spoon next to the bowl",
         6: "place the tab inside the dishwasher",
         7: "get the soap from the bar navigate to the sink and bring it to jacob",
         8: "tell the time find the waving person in the living room and escort him to the dining table",
         9: "could you locate alexander in the bed guide him to the apartment and follow emma who is at the tv stand",
         10: "could you please take the leftmost object from the bookcase to the shelf"}

### Comparison Class "speech_to_text"
Handles the initialization and running of the different models, as well as comparison to ground truth and calculation of metrics.

True

In [3]:
class speech_to_text():
    """
    Initializes all speech-to-text models once and compares their output
    against a ground-truth sentence.

    Typical use: create one instance, then call load_data() followed by
    compare() for every directory of recordings. Every compare() call writes
    a detailed and a summarized report to the "comparisons" folder; the files
    are numbered with self.counter, which starts at 0 and grows by one per call.
    """

    #Metrics that are averaged per model in the summarized report
    METRICS = ("WER", "RTF")

    def __init__(self):
        #Initialize models that will be compared
        self.setup_whisper = init_whisper()
        self.setup_google_default = init_google()
        self.setup_google_cas = init_google(model="command_and_search")
        self.setup_google_video = init_google(model="video")
        self.setup_vosk = init_vosk()
        self.setup_wav2vec2 = init_wav2vec2()
        self.setup_wav2vec2_update = init_wav2vec2(model_name = "facebook/wav2vec2-large-robust-ft-libri-960h")
        print("All models initialized")
        self.counter = 0

        #Filled by load_data(); defined here so compare() can report missing data clearly
        self.file_directory = None
        self.ground_truth = None
        self.file_paths = []

    def _models(self):
        """
        Returns one (report name, run function, setup) tuple per model,
        in the order in which the models appear in the reports.
        """
        return [
            ("Whisper", run_whisper, self.setup_whisper),
            ("Google_default", run_google, self.setup_google_default),
            ("Google_CAS", run_google, self.setup_google_cas),
            ("Google_video", run_google, self.setup_google_video),
            ("Vosk", run_vosk, self.setup_vosk),
            ("Wav2vec2", run_wav2vec2, self.setup_wav2vec2),
            ("Wav2vec2_update", run_wav2vec2, self.setup_wav2vec2_update),
        ]

    @staticmethod
    def _mean(values):
        """Returns the arithmetic mean of a non-empty list, rounded to two decimals."""
        return round(sum(values)/len(values), 2)

    def load_data(self, file_directory, ground_truth):
        """
        Stores the recordings and the sentence they are compared against.

        - file_directory: directory that contains audio files (.wav)
        - ground truth: true text of the speech in the audio files

        Only entries ending in ".wav" (case-insensitive) are kept, in natural
        sort order, so stray files in the folder are not passed to the models.
        """
        self.file_directory = file_directory
        self.ground_truth = ground_truth
        self.file_paths = [file for file in natsorted(os.listdir(file_directory))
                           if file.lower().endswith(".wav")]

    def compare(self):
        """
        Compares the different models' outputs in terms of
        Word Error Rate (WER) and Real-Time Factor (RTF)

        Runs every model on every loaded recording and saves two dictionaries
        to the "comparisons" folder:
        - "<counter>_detailed.npy": per recording ("1", "2", ...) and model the
          WER, RTF and model output
        - "<counter>_summarized.npy": per model the mean WER and RTF (2 decimals)
        Both are dictionaries stored with np.save, i.e. they have to be read
        with np.load(..., allow_pickle=True).item().

        Raises ValueError if no recordings were loaded (load_data() not called
        or the directory holds no .wav files), because no mean can be computed.
        """
        if not getattr(self, "file_paths", None):
            raise ValueError("No .wav files loaded: call load_data() with a directory "
                             "that contains recordings before calling compare()")

        #Create the output folder if it does not exist yet
        os.makedirs("comparisons", exist_ok=True)

        #Create output containers
        models = self._models()
        out_dict = {}
        sum_dict = {name: {metric: [] for metric in self.METRICS} for name, _, _ in models}

        for test_num, file in enumerate(self.file_paths, start=1):
            file_name = os.path.join(self.file_directory, file)

            #Run models and store metrics & model output in a nested dictionary
            results = {}
            for name, run_model, setup in models:
                output, elapsed = run_model(file_name, setup)
                results[name] = {"WER": wer(self.ground_truth, output),
                                 "RTF": rtf(elapsed, file_name),
                                 "Model Output": output}

                #Keep track of the metrics for each model to later summarize
                for metric in self.METRICS:
                    sum_dict[name][metric].append(results[name][metric])
            out_dict[str(test_num)] = results

        #Calculate mean for each metric for each model
        sum_dict = {name: {metric: self._mean(values) for metric, values in metrics.items()}
                    for name, metrics in sum_dict.items()}

        #Save output (better than returning when running all models with all audios in a loop)
        np.save("comparisons/{}_detailed.npy".format(str(self.counter)),out_dict)
        np.save("comparisons/{}_summarized.npy".format(str(self.counter)),sum_dict)
        self.counter += 1

        #in case immediate output is desired
        #return out_dict, sum_dict 

### Run comparison

In [4]:
#Initialize class: sets up all seven model configurations once so they can be reused
#for every recording folder
test_class = speech_to_text()

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


All models initialized


In [7]:

#Iterates over recording folder, loads data, and runs comparison
#The sentence numbers are taken from the ground-truth dict (folder i holds the recordings
#of sentence i), and the folder variable is not called `dir` so that the builtin dir()
#stays usable in the rest of the notebook.
for i in sorted(tests):
    recording_dir = "C:/Users/Daydreamore/Desktop/Semester/speech_recognition/recordings/{}".format(i)
    test_class.load_data(file_directory=recording_dir, ground_truth=tests[i])
    test_class.compare()

### After comparison: Load saved data of comparison

Summarized Report (note that the last sentence was rerun later because one of the recordings contained more than just the test sentence and thereby drastically increased the average WER):

In [13]:
import numpy as np

#Prints, for each of the ten test sentences, the sentence followed by its summarized report
SUMMARY_PATTERN = "comparisons/{}_summarized.npy"
for i in range(10):
    sentence = tests[i+1]
    print(sentence)
    #The report is a dict stored with np.save: allow_pickle is needed to read it back
    #and .item() unwraps the dict from the loaded array
    report = np.load(SUMMARY_PATTERN.format(i), allow_pickle=True).item()
    print(report)
    print("\n")


pick up the bag
{'Whisper': {'WER': 0.7, 'RTF': 0.66}, 'Google_default': {'WER': 1.34, 'RTF': 0.6}, 'Google_CAS': {'WER': 1.12, 'RTF': 0.6}, 'Google_video': {'WER': 1.0, 'RTF': 0.53}, 'Vosk': {'WER': 1.38, 'RTF': 0.99}, 'Wav2vec2': {'WER': 1.46, 'RTF': 0.36}, 'Wav2vec2_update': {'WER': 0.8, 'RTF': 0.33}}


open the door
{'Whisper': {'WER': 0.1, 'RTF': 0.66}, 'Google_default': {'WER': 1.0, 'RTF': 0.51}, 'Google_CAS': {'WER': 1.0, 'RTF': 0.52}, 'Google_video': {'WER': 1.0, 'RTF': 0.52}, 'Vosk': {'WER': 1.36, 'RTF': 1.03}, 'Wav2vec2': {'WER': 1.14, 'RTF': 0.36}, 'Wav2vec2_update': {'WER': 0.93, 'RTF': 0.33}}


touch the desk
{'Whisper': {'WER': 2.56, 'RTF': 1.02}, 'Google_default': {'WER': 2.46, 'RTF': 0.83}, 'Google_CAS': {'WER': 2.46, 'RTF': 0.84}, 'Google_video': {'WER': 1.44, 'RTF': 0.65}, 'Vosk': {'WER': 2.15, 'RTF': 1.85}, 'Wav2vec2': {'WER': 2.05, 'RTF': 0.37}, 'Wav2vec2_update': {'WER': 2.21, 'RTF': 0.34}}


could you please hand the coke to lisa
{'Whisper': {'WER': 1.02, 'RTF': 0

In [11]:
#Reruns the comparison for a single sentence (here: the last one)
i = 10
recording_dir = "C:/Users/Daydreamore/Desktop/Semester/speech_recognition/recordings/{}".format(i)
#compare() names its output files after test_class.counter, which keeps counting across
#calls. Point it back at this sentence's slot (sentence i -> file i-1) so the rerun
#overwrites the result that the report cells load instead of landing in a new, unread file.
test_class.counter = i - 1
test_class.load_data(file_directory=recording_dir, ground_truth=tests[i])
test_class.compare()

Detailed Reports:

In [None]:
import numpy as np

#Prints, for each of the ten test sentences, the sentence followed by its detailed report
DETAILED_PATTERN = "comparisons/{}_detailed.npy"
for i in range(10):
    sentence = tests[i+1]
    print(sentence)
    #The report is a dict stored with np.save: allow_pickle is needed to read it back
    #and .item() unwraps the dict from the loaded array
    report = np.load(DETAILED_PATTERN.format(i), allow_pickle=True).item()
    print(report)
    print("\n")