In [None]:
# Play a simple audio file
import IPython.display as display
path = "/mnt/sda/audio/simple/M1F1-Alaw-AFsp.wav"
display.Audio(path, autoplay=True)

# "Seed is needed to plant the spring corn"

In [None]:
# For managing audio file
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

In [None]:
# Loading the audio file
audio, rate = librosa.load(path, sr = 16000)
print(audio)
print(rate)

In [None]:
# Importing Wav2Vec pretrained model
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

In [None]:
# Taking an input value
input_values = tokenizer(audio, return_tensors = "pt").input_values
# Getting the logit values (non-normalized values)
logits = model(input_values).logits
# Passing the logit values to softmax to get the predicted values
prediction = torch.argmax(logits, dim = -1)
# The final step is to pass the prediction to the tokenizer decode to get the transcription
transcription = tokenizer.batch_decode(prediction)[0]
print(transcription)
# The output is "OUR FEVRIN HAG IS AN YOU WILL LAB OFF FRING CORN" which is not even close to "Seed is needed to plant the spring corn".
# Lets try something better.

In [None]:
from datasets import load_dataset
from transformers import pipeline

# replace following lines to load an audio file of your choice
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
audio_file = ds[0]["file"]

print(audio_file)
display.Audio(audio_file, autoplay=True)

In [None]:
from datasets import load_dataset
from transformers import pipeline

asr = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-xls-r-300m-21-to-en", feature_extractor="facebook/wav2vec2-xls-r-300m-21-to-en")

In [None]:
translation = asr(audio_file)

In [None]:
MAPPING = {
    "de": 250003,
    "tr": 250023,
    "fa": 250029,
    "sv": 250042,
    "mn": 250037,
    "zh": 250025,
    "cy": 250007,
    "ca": 250005,
    "sl": 250052,
    "et": 250006,
    "id": 250032,
    "ar": 250001,
    "ta": 250044,
    "lv": 250017,
    "ja": 250012,
}

In [None]:
import torch
from transformers import Wav2Vec2Processor, SpeechEncoderDecoderModel
from datasets import load_dataset

model_ckpt = "facebook/wav2vec2-xls-r-300m-en-to-15"

model = SpeechEncoderDecoderModel.from_pretrained(model_ckpt)
processor = Wav2Vec2Processor.from_pretrained(model_ckpt)

ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")

# select correct `forced_bos_token_id`
forced_bos_token_id = MAPPING["sv"]
print(forced_bos_token_id)

print("Done")

In [None]:
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
audio_file = ds[0]["file"]
display.Audio(audio_file, autoplay=True)

In [31]:
input_values = processor(ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt").input_values

# Inference: Translate English speech to German
generated = model.generate(input_values)
decoded = processor.batch_decode(generated, skip_special_tokens=True)[0]
decoded

'Mr. Quilter ist der Apostel der Mittelschicht und wir freuen uns, sein Evangelium willkommen heißen zu können.'