# Speech to Text

Convert speech to text with huggingface.

Compare performance for wav2vec2 versus whisper.

In [1]:
import os
import glob
import pandas as pd
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
path = '/Users/tracy.reuter/Desktop/speech_samples'

## Using Wav2Vec2

In [2]:
# use wav2vec2 processor and pretrained model
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
wav = [] # list to store .wav filenames
txt = [] # list to store .txt transcriptions
for filename in sorted(glob.glob(os.path.join(path, '*.wav'))):
    # load the audio file (.wav format, 16 kHz sampling rate)
    audio, rate = librosa.load(filename, sr = 16000)
    # get input_values
    input_values = processor(audio, return_tensors = "pt", sampling_rate = 16000).input_values
    # use the model to generate predicted token ids from the input_values
    # logits are non-normalized prediction values
    prediction = torch.argmax((model(input_values).logits), dim = -1)
    # decode token ids to text
    transcription = processor.batch_decode(prediction)[0]
    # append results
    wav.append(filename)
    txt.append(transcription)
# summarize results
df_wav2vec2 = pd.DataFrame({'wav_input': wav, 'txt_output': txt})

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Using OpenAI Whisper

In [3]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
# Load the Whisper model in Hugging Face format:
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
wav = [] # list to store .wav filenames
txt = [] # list to store .txt transcriptions
for filename in sorted(glob.glob(os.path.join(path, '*.wav'))):
    # load the audio file (.wav format, 16 kHz sampling rate)
    audio, rate = librosa.load(filename, sr = 16000)
    # get input_features
    input_features = processor(audio, return_tensors = "pt", sampling_rate = 16000).input_features
    # use the model to generate predicted token ids from the input_features
    predicted_ids = model.generate(input_features)
    # decode token ids to text
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens = True)
    # append results
    wav.append(filename)
    txt.append(transcription)
# summarize results
df_whisper = pd.DataFrame({'wav_input': wav, 'txt_output': txt})

## Comparing Results

In [4]:
pd.set_option('display.max_colwidth', None)
print(df_wav2vec2.to_string(index=False))

                                                 wav_input                    txt_output
  /Users/tracy.reuter/Desktop/speech_samples/the.apple.wav   LOOK AT THE WONDERFUL APPLE
 /Users/tracy.reuter/Desktop/speech_samples/the.apples.wav   LOOK AT THE WONDERFUL APPLE
   /Users/tracy.reuter/Desktop/speech_samples/the.baby.wav    LOOK AT THE BEAUTIFUL BABY
  /Users/tracy.reuter/Desktop/speech_samples/the.babys.wav  LOOK AT THE BEAUTIFUL BABIES
   /Users/tracy.reuter/Desktop/speech_samples/the.bike.wav    LOOK AT THE WONDERFUL BIKE
  /Users/tracy.reuter/Desktop/speech_samples/the.bikes.wav   LOOK AT THE WONDERFUL BIKES
 /Users/tracy.reuter/Desktop/speech_samples/the.cookie.wav  LOOK AT THE WONDERFUL COOKIE
/Users/tracy.reuter/Desktop/speech_samples/the.cookies.wav LOOK AT THE WONDERFUL COOKIES
  /Users/tracy.reuter/Desktop/speech_samples/the.doggy.wav   LOOK AT THE BEAUTIFUL DOGGY
 /Users/tracy.reuter/Desktop/speech_samples/the.doggys.wav LOOK AT THE BEAUTIFUL DOGGIYS
  /Users/tracy.reuter

In [5]:
print(df_whisper.to_string(index=False))

                                                 wav_input                        txt_output
  /Users/tracy.reuter/Desktop/speech_samples/the.apple.wav   [ Look at the wonderful apple.]
 /Users/tracy.reuter/Desktop/speech_samples/the.apples.wav  [ Look at the wonderful apples.]
   /Users/tracy.reuter/Desktop/speech_samples/the.baby.wav    [ Look at the beautiful baby.]
  /Users/tracy.reuter/Desktop/speech_samples/the.babys.wav  [ Look at the beautiful babies.]
   /Users/tracy.reuter/Desktop/speech_samples/the.bike.wav    [ Look at the wonderful bike.]
  /Users/tracy.reuter/Desktop/speech_samples/the.bikes.wav   [ Look at the wonderful bikes.]
 /Users/tracy.reuter/Desktop/speech_samples/the.cookie.wav  [ Look at the wonderful cookie.]
/Users/tracy.reuter/Desktop/speech_samples/the.cookies.wav [ Look at the wonderful cookies.]
  /Users/tracy.reuter/Desktop/speech_samples/the.doggy.wav   [ Look at the beautiful doggy.]
 /Users/tracy.reuter/Desktop/speech_samples/the.doggys.wav [ Look at t

## Conclusion

IMO, Whisper beats Wav2Vec2 in at least 3 ways:

1. More performant.

- Transcribed 20% faster.

- Future enhancements could increase speed.

2. More accurate.

- Transcribed "apple" versus "apples" correctly.

- Spelled "doggies" correctly as "doggies", not as "DOGGIYS".

3. More nuanced.

- Transcribed 3 sentences with emphatic punctuation (! instead of .)

- Punctuation indicates emphasis and emotion, useful for downstream sentiment analysis.