# **Vietnamese Speech Recognition with Wav2Vec**
Nhận diện giọng nói tiếng Việt với Wav2Vec

BÀI TẬP LỚN MÔN HỌC MÁY

---

**Họ và tên SV 1:** Nguyễn Viết Tuấn

**MSSV SV 1:** 22001363

---

**Họ và tên SV 2:** Nguyễn Minh Hiếu

**MSSV SV 2:** 22001321

---

**Họ và tên SV 3:** Nguyễn Tiến Đồng

**MSSV SV 3:** 22001314

---

**Họ và tên SV 4:** Vũ Trung Kiên

**MSSV SV 4:** 22001325

---

# Tải mô hình về

In [None]:
!pip install transformers gradio evaluate jiwer



In [None]:
!pip install -U datasets
!pip uninstall -y peft
# Lần đầu chạy thì khởi động lại phiên sau khi thực hiện 2 ô code trên

[0m

In [None]:
from transformers import AutoModelForCTC
from transformers import AutoConfig
from transformers import AutoFeatureExtractor
from transformers import Wav2Vec2Processor
from transformers import AutoTokenizer
import torch
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Model id handed to every from_pretrained() call below.
model_checkpoint = "Tuan457/tuan_vntts_test"

config = AutoConfig.from_pretrained(model_checkpoint)

# When the checkpoint config names no tokenizer class, select the tokenizer by
# model type and pass no config; otherwise keep the config and pass no type.
if config.tokenizer_class is None:
  tokenizer_type = config.model_type
  config = None
else:
  tokenizer_type = None

feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)

tokenizer = AutoTokenizer.from_pretrained(
  model_checkpoint,
  config=config,
  tokenizer_type=tokenizer_type,
  unk_token="[UNK]",
  pad_token="[PAD]",
  word_delimiter_token="|",
)

# The processor bundles audio feature extraction and text (de)tokenization.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

# CTC model; pad id and vocabulary size are taken from the tokenizer above.
model = AutoModelForCTC.from_pretrained(
    model_checkpoint,
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean",
    pad_token_id=tokenizer.pad_token_id,
    vocab_size=len(tokenizer),
)
model = model.to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`use_fast` is set to `True` but the tokenizer class does not have a fast version.  Falling back to the slow version.


# Xử lý dữ liệu tập validation

In [None]:
from datasets import load_dataset, Audio
from evaluate import load as load_metric

In [7]:
# First 100 examples of the Vietnamese Common Voice 11 validation split.
# trust_remote_code=True accepts the dataset's loading script up front, so the
# cell no longer stops at the interactive
# "Do you wish to run the custom code? [y/N]" prompt during Run All.
data = load_dataset(
    "mozilla-foundation/common_voice_11_0",
    "vi",
    split="validation[:100]",
    trust_remote_code=True,
)

In [8]:
# Cast the "audio" column to an Audio feature with sampling_rate=16_000, the
# same rate that is passed to the processor at inference time.
data = data.cast_column("audio", Audio(sampling_rate=16_000))

In [9]:
def VietnameseSpeechRecognition(arrTensor):
  """Transcribe one audio waveform with the module-level model.

  Args:
    arrTensor: waveform samples for a single clip; passed to the processor
      with sampling_rate=16000.

  Returns:
    The decoded transcription of the first (only) item in the batch.
  """
  features = processor(arrTensor, sampling_rate=16000, return_tensors="pt").to(device)
  # Inference only: no gradients are needed for the forward pass.
  with torch.no_grad():
      scores = model(**features).logits
  # Greedy decoding: pick the highest-scoring token id at every frame.
  token_ids = scores.argmax(dim=-1)
  return processor.batch_decode(token_ids)[0]

In [10]:
# Choose an audio clip to test
import IPython.display as ipd
import numpy as np
import random
# Take the waveform array of validation example 19 and render an audio player
# for it (the player is the cell's displayed value).
audio_sample = data[19]["audio"]["array"]
ipd.Audio(data=audio_sample, autoplay=False, rate=16000)

In [11]:
# Transcribe the clip held in audio_sample and print the predicted text.
print(VietnameseSpeechRecognition(audio_sample))

quân mừa mắt gia mới biết bình vừa nằm mơ


# Làm giao diện phần mềm

In [12]:
import os

import numpy as np

import gradio as gr
import scipy.signal as sps

def ReSampleAudio(data,current_rate,new_rate=16000):
  """Resample `data` from `current_rate` to `new_rate` samples per second.

  Args:
    data: sequence/array of samples; its length is measured with len().
    current_rate: sampling rate of `data`.
    new_rate: target sampling rate (default 16000).

  Returns:
    The output of scipy.signal.resample for the rounded target length.
  """
  # Target length = original length scaled by the rate ratio, rounded.
  target_len = round(len(data) * float(new_rate) / current_rate)
  resampled = sps.resample(data, target_len)
  return resampled

def reverse_audio(audio):
    """Gradio callback: transcribe a recorded clip to Vietnamese text.

    Despite its name, this function does not reverse anything; the name is
    kept because the interface is wired to it.

    Args:
        audio: `(sample_rate, samples)` tuple as unpacked below, or None when
            the user submits without recording anything.

    Returns:
        The transcription string, or "" when there is no audio to transcribe.
    """
    # Submitting with no recording passes None; unpacking it would raise.
    if audio is None:
        return ""
    sr, dataArr = audio
    dataArr = np.asarray(dataArr)
    # Multi-channel recordings arrive as (samples, channels); average the
    # channels down to mono before handing a single waveform to the model.
    if dataArr.ndim > 1:
        dataArr = dataArr.mean(axis=1)
    # Nothing was captured: there is nothing to resample or transcribe.
    if dataArr.size == 0:
        return ""
    dataArr = ReSampleAudio(dataArr,sr)
    inputs = torch.tensor(dataArr)
    return VietnameseSpeechRecognition(inputs)


# Microphone-in / text-out interface around reverse_audio. The example clips
# are the file paths of five validation examples, and their outputs are cached.
demo = gr.Interface(
    fn=reverse_audio,
    inputs="microphone",
    outputs="text",
    examples=[data[idx]["path"] for idx in (18, 92, 55, 47, 26)],
    cache_examples=True,
)


demo.launch()
# Open the link printed below to use the app.

Using cache from '/content/.gradio/cached_examples/13' directory. If method or examples have changed since last caching, delete this folder to clear cache.

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c2bd11e27df87ae8ca.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




# So sánh với mô hình khác

In [14]:
!huggingface-cli login
# token: <REDACTED - never write an access token in a notebook; paste it at the prompt and revoke any token that was committed here>
# git credential = Y


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: fineG

In [21]:
import torch
import torchaudio
from datasets import load_dataset, Audio
from evaluate import load as load_metric
from transformers import Wav2Vec2FeatureExtractor
import re
# Vietnamese Common Voice 11 test split. trust_remote_code=True accepts the
# dataset's loading script up front, so the cell does not block on the
# interactive "[y/N]" confirmation prompt.
test_dataset = load_dataset(
    "mozilla-foundation/common_voice_11_0",
    "vi",
    split="test",
    trust_remote_code=True,
)
test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16_000))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
wer = load_metric("wer")
# Characters stripped from the reference sentences before scoring.
chars_to_ignore_regex = r'[,?.!\-;:"“%\'�]'
# Preprocessing the datasets.
# We need to read the audio files as arrays

def speech_file_to_array_fn(batch):
  """Add the normalized reference text and the raw waveform to one example.

  Writes two keys on `batch` and returns it:
    target_text: `sentence` with chars_to_ignore_regex matches removed,
      lower-cased.
    speech: the `array` entry of the example's `audio` field.
  """
  clip = batch["audio"]
  stripped = re.sub(chars_to_ignore_regex, '', batch["sentence"])
  batch["target_text"] = stripped.lower()
  batch['speech'] = clip['array']
  return batch
# Apply the preprocessing to every test example (adds target_text and speech).
test_dataset = test_dataset.map(speech_file_to_array_fn)

def evaluate(batch):
  """Transcribe one preprocessed example and store the prediction on it.

  Reads `batch['speech']`, runs the module-level processor and model, and
  writes the decoded text of the first item to `batch["pred_strings"]`.

  Returns:
    The same `batch` with `pred_strings` added.
  """
  # NOTE(review): do_normalize=False is passed here but not in
  # VietnameseSpeechRecognition; confirm whether the processor honours it.
  encoded = processor(
      batch['speech'],
      sampling_rate=16000,
      return_tensors="pt",
      padding=True,
      do_normalize=False,
  ).to(device)
  # Inference only: no gradients are needed for the forward pass.
  with torch.no_grad():
      scores = model(**encoded).logits
  # Greedy decoding: pick the highest-scoring token id at every frame.
  token_ids = scores.argmax(dim=-1)
  batch["pred_strings"] = processor.batch_decode(token_ids)[0]
  return batch
# Transcribe every test example, then report the word error rate as a
# percentage with two decimals (the previous "{:2f}" spec set a minimum width
# of 2, which has no effect, instead of a precision).
result = test_dataset.map(evaluate)
print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["target_text"])))

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]

WER: 17.527696
