In [None]:
!pip install transformers

In [None]:
# Download sample audio file
!gdown --id 1GkDBSWODEIxYijPua_hL4jRP3iquYFIA

Downloading...
From: https://drive.google.com/uc?id=1GkDBSWODEIxYijPua_hL4jRP3iquYFIA
To: /content/p225_003.wav
  0% 0.00/192k [00:00<?, ?B/s]100% 192k/192k [00:00<00:00, 29.4MB/s]


## Wav2Vec2.0 for Hidden Representations

In [None]:
import soundfile as sf
import torch
from transformers import Wav2Vec2Model, Wav2Vec2Tokenizer, Wav2Vec2ForMaskedLM, Wav2Vec2ForCTC

# Sampling rate (Hz) the checkpoint expects; the facebook/wav2vec2-base-960h
# model card asks for speech input sampled at 16 kHz.
EXPECTED_SAMPLE_RATE = 16_000

# Load the pretrained tokenizer (converts raw audio into model inputs) and the
# bare Wav2Vec2 encoder (no task head). `tokenizer` is reused by later cells.
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

# Load the audio; point this at your own file if needed.
audio_input, sample_rate = sf.read("/content/p225_003.wav")

# Fail loudly on a sample-rate mismatch instead of silently producing
# degraded features from audio the model was not trained on.
if sample_rate != EXPECTED_SAMPLE_RATE:
    raise ValueError(
        f"Expected audio sampled at {EXPECTED_SAMPLE_RATE} Hz, got {sample_rate} Hz; "
        "resample the file before running the model."
    )

# Apply the tokenizer to get a PyTorch tensor of input values.
input_values = tokenizer(audio_input, return_tensors="pt").input_values

# Run the model. Gradients are not needed for feature extraction, so disable
# autograd to avoid building a graph and holding extra memory.
with torch.no_grad():
    hidden_state = model(input_values).last_hidden_state

# Presumably (batch, time steps, hidden size) -- confirm against the printed shape.
print(hidden_state.shape)

torch.Size([1, 300, 768])


**Note**: To train a new model with a smaller amount of training data, we can feed these hidden representations directly into our network instead of using hand-crafted feature extractors such as MFCCs or filter banks.

## Wav2Vec2.0 with MaskedLM Head

In [None]:
# Load the same checkpoint with the MaskedLM head; this rebinds `model`.
model = Wav2Vec2ForMaskedLM.from_pretrained("facebook/wav2vec2-base-960h")

# Load the audio; point this at your own file if needed.
audio_input, _ = sf.read("/content/p225_003.wav")

# Transcribe: take the highest-scoring token id at each position (argmax over
# the last logits dimension) and decode the ids back to text.
input_values = tokenizer(audio_input, return_tensors="pt").input_values
# Inference only -- disable autograd so no gradient graph is built.
with torch.no_grad():
    logits = model(input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
transcription = tokenizer.batch_decode(predicted_ids)[0]

print(transcription)



SIX SPOONS OF FRESH SNOW PEESE FIVE THIK SLABS OF BLUE CHEESE AND MAY BE A STACK FOR HER BROTHER BOB


## Wav2Vec2.0 with CTC

In [None]:
# Load the checkpoint with the CTC head; this rebinds `model`.
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# `audio_input` and `tokenizer` come from the earlier cells.
input_values = tokenizer(audio_input, return_tensors="pt").input_values  # Batch size 1

# Inference only -- disable autograd so no gradient graph is built.
with torch.no_grad():
    logits = model(input_values).logits

# Highest-scoring token id at each position, then decode the first
# (only) sequence in the batch back to text.
predicted_ids = torch.argmax(logits, dim=-1)
transcription = tokenizer.decode(predicted_ids[0])

print(transcription)

SIX SPOONS OF FRESH SNOW PEESE FIVE THIK SLABS OF BLUE CHEESE AND MAY BE A STACK FOR HER BROTHER BOB
