<a href="https://colab.research.google.com/github/vfeng02/wav2vec_cantonese_xl/blob/main/wav2vec_xl_cantonese_pretrained.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
%%capture
# Install the notebook's dependencies; the %%capture magic above hides pip's output.
# datasets and transformers are pinned to exact versions.
!pip install datasets==1.18.3
!pip install transformers==4.24.0
# NOTE(review): the packages below are unpinned, so a fresh runtime may resolve
# different versions over time -- consider pinning them as well.
!pip install jiwer
!pip install torchaudio
!pip install librosa

In [None]:
from datasets import load_dataset, load_metric, Audio

# Only the Cantonese (zh-HK) "test" split is loaded here; the "train+validation"
# split is not needed because this notebook evaluates an already fine-tuned model.
common_voice_test = load_dataset(path="common_voice", name="zh-HK", split="test")

In [None]:
# Drop the metadata columns and re-declare the "audio" column with a 16 kHz
# sampling rate, in a single chained expression.
common_voice_test = (
    common_voice_test
    .remove_columns([
        "accent", "age", "client_id", "down_votes",
        "gender", "locale", "segment", "up_votes",
    ])
    .cast_column("audio", Audio(sampling_rate=16_000))
)

In [None]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Checkpoint identifier shared by the processor and the model.
model_id = "ctl/wav2vec2-large-xlsr-cantonese"

# Both objects are loaded from the same checkpoint so they stay in sync.
processor = Wav2Vec2Processor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)

# Move the model to the GPU (left as the cell's last expression, so the notebook displays it).
model.to("cuda")


In [None]:
import re

# Character class listing every punctuation mark stripped from transcripts.
# Written as a raw string: sequences such as `\,` or `\?` are not valid Python
# string escapes and make a normal literal emit DeprecationWarning
# (SyntaxWarning on newer interpreters). Inside the regex character class each
# backslash-escaped symbol simply matches that symbol literally.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:"\“\%\‘\”\�\．\⋯\！\－\：\–\。\》\,\）\,\？\；\～\~\…\︰\，\（\」\‧\《\﹔\、\—\／\,\「\﹖\·\']'


def speech_file_to_array_fn(batch):
    """Normalise one example's transcript and expose its waveform.

    Args:
        batch: Mapping with a "sentence" string and an "audio" mapping whose
            "array" entry supports ``.squeeze()``.

    Returns:
        The same ``batch`` object, modified in place: "sentence" has every
        character matched by ``chars_to_ignore_regex`` removed and is
        lower-cased, and a new "speech" entry holds the squeezed audio array.
    """
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
    batch["speech"] = batch["audio"]['array'].squeeze()
    return batch

# Apply the per-example transcript/waveform preparation to the whole test split.
test_dataset = common_voice_test.map(function=speech_file_to_array_fn)

In [None]:
import torch 

def evaluate(batch):
    """Transcribe a batch of waveforms with the notebook-level model.

    Args:
        batch: Mapping whose "speech" entry is a list of waveforms; it is
            passed to the notebook-level ``processor`` with a 16 kHz sampling
            rate and padding enabled.

    Returns:
        The same ``batch`` object with a new "pred_strings" entry holding the
        decoded transcription for each waveform.
    """
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

    # Send the inputs to whichever device the model lives on instead of
    # hard-coding "cuda", so the two can never disagree.
    device = model.device
    with torch.no_grad():
        logits = model(
            inputs.input_values.to(device),
            attention_mask=inputs.attention_mask.to(device),
        ).logits

    # Pick the highest-scoring token id along the last (vocabulary) axis,
    # then let the processor turn the id sequences into strings.
    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch

# Run batched inference over the prepared test split, 16 examples per call to evaluate().
result = test_dataset.map(evaluate, batched=True, batch_size=16)

# Score the predictions against the normalised reference transcripts.
cer = load_metric("cer")
cer_score = cer.compute(predictions=result["pred_strings"], references=result["sentence"])

# ".2f" rounds to two decimals; note that "2f" (no dot) would instead mean
# "minimum width 2" and print the default six decimal places.
print("CER: {:.2f}".format(100 * cer_score))