## Usage

General Model

In [3]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import Audio, load_dataset
import torch

processor = Wav2Vec2Processor.from_pretrained("anuragshas/wav2vec2-large-xlsr-53-odia")
model = Wav2Vec2ForCTC.from_pretrained("anuragshas/wav2vec2-large-xlsr-53-odia")
ds1 = load_dataset("mozilla-foundation/common_voice_11_0", "or", split="test")
ds1 = ds1.cast_column("audio", Audio(sampling_rate=16_000))

Some weights of the model checkpoint at anuragshas/wav2vec2-large-xlsr-53-odia were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_v', 'wav2vec2.encoder.pos_conv_embed.conv.weight_g']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at anuragshas/wav2vec2-large-xlsr-53-odia and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should prob

In [4]:
input_values = processor(ds1[0]["audio"]["array"], return_tensors="pt").input_values
with torch.no_grad():
    logits = model(input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
transcription

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


['ଏଇ କ ଏ କ']

-------------------------------------------------------------------------------------------------------------------------------------------

-------------------------------------------------------------------------------------------------------------------------------------------

Ranjit Finetuned

In [5]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import Audio, load_dataset

# load model and processor
processor = WhisperProcessor.from_pretrained("Ranjit/odia_whisper_small_v3.0")
model = WhisperForConditionalGeneration.from_pretrained("Ranjit/odia_whisper_small_v3.0")
model.config.forced_decoder_ids = None

# load streaming dataset and read first audio sample
ds2 = load_dataset("mozilla-foundation/common_voice_11_0", "or", split="test")
ds2 = ds2.cast_column("audio", Audio(sampling_rate=16_000))

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [6]:
input_features = processor(ds2[0]["audio"]["array"], sampling_rate=ds2[0]["audio"]["sampling_rate"], return_tensors="pt").input_features
predicted_ids = model.generate(input_features)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
transcription

['ିକା ଏକ']

-------------------------------------------------------------------------------------------------------------------------------------------

-------------------------------------------------------------------------------------------------------------------------------------------

Another Model

In [7]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import Audio, load_dataset

# load model and processor
processor = WhisperProcessor.from_pretrained("Apocalypse-19/whisper-large-odiya")
model = WhisperForConditionalGeneration.from_pretrained("Apocalypse-19/whisper-large-odiya")
model.config.forced_decoder_ids = None

# load streaming dataset and read first audio sample
ds3 = load_dataset("mozilla-foundation/common_voice_11_0", "or", split="test")
ds3 = ds3.cast_column("audio", Audio(sampling_rate=16_000))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
input_features = processor(ds3[0]["audio"]["array"], sampling_rate=ds3[0]["audio"]["sampling_rate"], return_tensors="pt").input_features
predicted_ids = model.generate(input_features)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
transcription

['ଏକ ଏକ ।']

## Chunking with Timestamps

In [9]:
import torch
from transformers import pipeline
from datasets import load_dataset

device = "cuda:0" if torch.cuda.is_available() else "cpu"

pipe = pipeline(
  "automatic-speech-recognition",
  model="Ranjit/odia_whisper_small_v3.0",
  chunk_length_s=10,
  device=device,
)

dsC = load_dataset("mozilla-foundation/common_voice_11_0", "or", split="test")
dsC = dsC.cast_column("audio", Audio(sampling_rate=16_000))

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [32]:
import torch
from transformers import pipeline
from datasets import load_dataset

device = "cuda:0" if torch.cuda.is_available() else "cpu"

#pipe = pipeline(
#  "automatic-speech-recognition",
#  model="Apocalypse-19/whisper-large-odiya",
#  chunk_length_s=10,
#  device=device,
#)

dsC2 = load_dataset("mozilla-foundation/common_voice_11_0", "or", split="test")
dsC2 = dsC2.cast_column("audio", Audio(sampling_rate=16_000))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
prediction = pipe(dsC[0]["audio"].copy(), batch_size=5)["text"]
prediction

'ିକା ଏକ'

In [11]:
prediction = pipe(dsC[9]["audio"].copy(), batch_size=5)["text"]
prediction

'ଦୁମ୍ ଦୁମ୍ ପୁରରେ କନ୍ୟାଟି ଘର ଯୋଗା ହେଲାଣି ।'

In [12]:
predictionT = pipe(dsC[0]["audio"].copy(), batch_size=5, return_timestamps=True)["chunks"]
predictionT

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


[{'timestamp': (0.0, 0.02), 'text': ''},
 {'timestamp': (5.0, None), 'text': ' ଏକ ଏକ'}]

In [13]:
predictionT = pipe(dsC[9]["audio"].copy(), batch_size=5, return_timestamps=True)["chunks"]
predictionT

[]

## Evaluation

In [1]:
# Imports
import torchaudio
from datasets import Audio, load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import re

In [None]:
# Loading
dsX = load_dataset("mozilla-foundation/common_voice_11_0", "or", split="test")
dsX = dsX.cast_column("audio", Audio(sampling_rate=16_000))
wer = load_metric("wer")
processor = Wav2Vec2Processor.from_pretrained("anuragshas/wav2vec2-large-xlsr-53-odia")
model = Wav2Vec2ForCTC.from_pretrained("anuragshas/wav2vec2-large-xlsr-53-odia")
model.to("cuda")

In [None]:
# Evaluation
def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch
result = dsX.map(evaluate, batched=True, batch_size=8)
print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))