## Speech_to_Text notebook to try out models and build a class that eventually automates testing.

In [1]:
#import json
import matplotlib.pyplot as plt
from natsort import natsorted
import numpy as np
import os
#import scipy
import seaborn as sns
#from tqdm.notebook import tqdm
#from vosk import Model, KaldiRecognizer


#from importlib import reload
#from scipy.io import wavfile
#from IPython.display import Audio

#Import the speech-to-text models
from google_api import init_google, run_google
from vosk_api import init_vosk, run_vosk
from wav2vec2_api import init_wav2vec2, run_wav2vec2
from whisper_api import init_whisper, run_whisper

#Import metrics
from metrics import rtf, wer

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
#Single-recording check of the Google backend: initialize it, then transcribe one file.
from google_api import init_google, run_google
#NOTE(review): machine-specific absolute path, repeated in the other single-file cells
file_path = "C:/Users/Daydreamore/Desktop/Semester/speech_recognition/recordings/7/2_7.wav"
setup = init_google()
#run_google returns a pair; presumably (transcript, elapsed time) - TODO confirm in google_api.
#NOTE(review): the name `time` would shadow the stdlib module if it were ever imported here.
out, time = run_google(file_path, setup)

In [10]:
#Single-recording check of the Whisper backend: initialize it, then transcribe one file.
from whisper_api import init_whisper, run_whisper
#NOTE(review): machine-specific absolute path, repeated in the other single-file cells
file_path = "C:/Users/Daydreamore/Desktop/Semester/speech_recognition/recordings/7/2_7.wav"
setup = init_whisper()
#run_whisper returns a pair; presumably (transcript, elapsed time) - TODO confirm in whisper_api.
#NOTE(review): the name `time` would shadow the stdlib module if it were ever imported here.
out, time = run_whisper(file_path, setup)

In [1]:
#Single-recording check of the Vosk backend: initialize it, then transcribe one file.
from vosk_api import init_vosk, run_vosk
#NOTE(review): machine-specific absolute path, repeated in the other single-file cells
file_path = "C:/Users/Daydreamore/Desktop/Semester/speech_recognition/recordings/7/2_7.wav"
setup = init_vosk()
#run_vosk returns a pair; presumably (transcript, elapsed time) - TODO confirm in vosk_api.
#NOTE(review): the name `time` would shadow the stdlib module if it were ever imported here.
out, time = run_vosk(file_path, setup)

In [1]:
#Single-recording check of the Wav2vec2 backend: initialize it, then transcribe one file.
from wav2vec2_api import init_wav2vec2, run_wav2vec2
#NOTE(review): machine-specific absolute path, repeated in the other single-file cells
file_path = "C:/Users/Daydreamore/Desktop/Semester/speech_recognition/recordings/7/2_7.wav"
setup = init_wav2vec2()
#run_wav2vec2 returns a pair; the saved cell output shows a (text, float) tuple,
#presumably (transcript, elapsed seconds) - TODO confirm in wav2vec2_api.
#NOTE(review): the name `time` would shadow the stdlib module if it were ever imported here.
out, time = run_wav2vec2(file_path, setup)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  tensor = as_tensor(value)


('get the soap from the bar navigate to the sink and bring it to jacob',
 1.696542501449585)

### Load paths of recordings to pass to speech_to_text class and create dict with ground truth

In [3]:
#File names (not full paths) of the recordings in this directory, in natural sort order.
#Built in one expression so no loop variable is left behind in the notebook namespace.
#NOTE(review): machine-specific absolute path.
paths = list(natsorted(os.listdir("H:/Speech_to_Text/7")))

In [2]:
#test sentences (partially generated with the General Purpose Service Robot Command Generator)
#Ground truth keyed by test number, counting from 1 in the order listed below.
#NOTE(review): sentences 8 and 9 contain commas and sentence 10 a hyphen - confirm that
#the WER metric normalizes punctuation, otherwise these count as word errors.
tests = dict(enumerate((
    "pick up the bag",
    "open the door",
    "touch the desk",
    "could you please hand the coke to lisa",
    "put the spoon next to the bowl",
    "place the tab inside the dishwasher",
    "get the soap from the bar navigate to the sink and bring it to jacob",
    "tell the time find the waving person in the living room, and escort him to the dining table",
    "could you locate alexander in the bed guide him to the apartment, and follow emma who is at the tv stand",
    "could you please take the left-most object from the bookcase to the shelf",
), start=1))

### Init class speech_to_text 
When finished, this class handles the initialization and running of the different models, as well as the comparison to the ground truth and the calculation of metrics.

In [6]:
class speech_to_text():
    """
    Runs several speech-to-text models on a directory of recordings and scores them.

    The four backends (Whisper, Google, Vosk, Wav2vec2) are initialized once in the
    constructor. load_data() selects the recordings and the ground-truth text;
    compare() runs every backend on every recording and saves the metrics to disk.
    """

    def __init__(self):
        """Initialize all models once so that repeated compare() calls reuse them."""
        #Initialize models that will be compared
        self.setup_whisper = init_whisper()
        self.setup_google = init_google()
        self.setup_vosk = init_vosk()
        self.setup_wav2vec2 = init_wav2vec2()
        print("All models initialized")
        #Prefix of the output file names; incremented after every compare() call
        self.counter = 0
        #Set by load_data(); compare() refuses to run while file_paths is empty
        self.file_directory = None
        self.ground_truth = None
        self.file_paths = []

    def load_data(self, file_directory, ground_truth):
        """
        Select the recordings and the reference text used by the next compare() call.

        - file_directory: directory that contains audio files (.wav)
        - ground truth: true text of the speech in the audio files
        """
        self.file_directory = file_directory
        self.ground_truth = ground_truth
        #Every directory entry is treated as a recording, in natural sort order
        self.file_paths = list(natsorted(os.listdir(file_directory)))

    def compare(self):
        """
        Run all models on the loaded recordings and save the results.

        Two files are written to "comparisons/" (created if missing), both prefixed
        with the current counter value:
        - <counter>_detailed.npy: per recording (keys "1", "2", ...) and per model,
          the "WER", "RTF" and "Model Output" entries
        - <counter>_summarized.npy: per model, the mean "WER" and "RTF" over all
          recordings, rounded to two decimals
        Both are dicts stored with np.save, i.e. pickled object arrays; read them
        back with np.load(path, allow_pickle=True).item().

        Raises ValueError if no recordings are loaded (load_data() was not called
        or the directory was empty).
        """
        if not self.file_paths:
            #Without this guard the mean below fails with a bare ZeroDivisionError
            raise ValueError("No audio files loaded: call load_data() with a "
                             "non-empty directory before compare()")

        #Model name -> (run function, setup object); dict order is the run order
        models = {"Whisper": (run_whisper, self.setup_whisper),
                  "Google": (run_google, self.setup_google),
                  "Vosk": (run_vosk, self.setup_vosk),
                  "Wav2vec2": (run_wav2vec2, self.setup_wav2vec2)
                  }

        out_dict = {}
        #Per-model lists of the metric values, averaged after the loop
        collected = {name: {"WER": [], "RTF": []} for name in models}

        for ix, file in enumerate(self.file_paths):
            file_name = "{}/{}".format(self.file_directory, file)

            #Store metrics & model output in a nested dictionary
            test_num = str(ix+1)
            out_dict[test_num] = {}
            for name, (run_model, setup) in models.items():
                #Each run function returns a pair (model output, time)
                model_out, model_time = run_model(file_name, setup)
                result = {"WER": wer(self.ground_truth, model_out),
                          "RTF": rtf(model_time, file_name),
                          "Model Output": model_out}
                out_dict[test_num][name] = result

                #Keep track of the metrics for each model to later summarize
                for metric in collected[name]:
                    collected[name][metric].append(result[metric])

        #Calculate mean for each metric for each model
        sum_dict = {name: {metric: round(sum(values)/len(values), 2)
                           for metric, values in metrics.items()}
                    for name, metrics in collected.items()}

        #Save output (better than returning when running all models with all audios in a loop)
        os.makedirs("comparisons", exist_ok=True)
        np.save("comparisons/{}_detailed.npy".format(str(self.counter)),out_dict)
        np.save("comparisons/{}_summarized.npy".format(str(self.counter)),sum_dict)
        self.counter += 1
        #return out_dict, sum_dict

In [5]:
#out = np.load("first_test.npy", allow_pickle=True)

In [7]:
#Create the comparison object; its constructor initializes all four models and
#prints "All models initialized" when done.
test_class = speech_to_text()

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


All models initialized


In [5]:
#out = test_class.compare()

  tensor = as_tensor(value)


In [8]:

#Run the comparison once per test sentence. The key of each sentence in `tests`
#is also the name of the sub-directory that holds its recordings.
#NOTE(review): machine-specific absolute path.
recordings_root = "C:/Users/Daydreamore/Desktop/Semester/speech_recognition/recordings"
for i, sentence in tests.items():
    #Named recording_dir rather than `dir` so the builtin dir() is not shadowed
    recording_dir = "{}/{}".format(recordings_root, i)
    test_class.load_data(file_directory=recording_dir, ground_truth=sentence)
    test_class.compare()