In [1]:
# !pip install transformers==4.20.0
# !pip install https://github.com/kpu/kenlm/archive/master.zip
# !pip install pyctcdecode==v0.4.0

In [3]:
from transformers.file_utils import cached_path, hf_bucket_url
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoTokenizer, Wav2Vec2ProcessorWithLM
import librosa
import torch
import numpy as np
import re
import json
from tqdm.auto import tqdm

from importlib.machinery import SourceFileLoader

In [4]:
model_name = "nguyenvulebinh/wav2vec2-large-vi-vlsp2020" # "nguyenvulebinh/wav2vec2-base-vietnamese-250h"
processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_name)
# model = Wav2Vec2ForCTC.from_pretrained(model_name)
model = SourceFileLoader("model", cached_path(hf_bucket_url(model_name,filename="model_handling.py"))).load_module().Wav2Vec2ForCTC.from_pretrained(model_name)

Downloading: 100%|██████████| 263/263 [00:00<00:00, 87.8kB/s]
Downloading: 100%|██████████| 1.14k/1.14k [00:00<00:00, 293kB/s]
Downloading: 100%|██████████| 396/396 [00:00<00:00, 56.6kB/s]
Downloading: 100%|██████████| 30.0/30.0 [00:00<00:00, 30.0kB/s]
Downloading: 100%|██████████| 2.53k/2.53k [00:00<00:00, 1.30MB/s]

Please use `allow_patterns` and `ignore_patterns` instead.
Downloading: 100%|██████████| 863/863 [00:00<00:00, 216kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading: 100%|██████████| 78.0/78.0 [00:00<00:00, 13.0kB/s]
Downloading: 0.00B [00:00, ?B/s] | 2/4 [00:05<00:05,  2.51s/it]
Downloading: 100%|██████████| 2.47G/2.47G [24:21<00:00, 1.69MB/s]
Fetching 4 files: 100%|██████████| 4/4 [24:32<00:00, 368.10s/it]
Only 0 unigrams passed as vocabulary

In [6]:
wav, _ = librosa.load("./37303134325f3137.wav", sr = 16000)

In [7]:
input_values = processor.feature_extractor(wav, sampling_rate=16000, return_tensors="pt")
output = model(**input_values)

logits = model(**input_values).logits[0]
pred_ids = torch.argmax(logits, axis=-1)
pred_transcript = processor.tokenizer.decode(pred_ids)
print(f"transcript: {pred_transcript}")

time_offset = model.config.inputs_to_logits_ratio / 16000

outputs = processor.tokenizer.decode(pred_ids, output_word_offsets=True)
lyric_offset = [
    {
        "Label": d["word"],
        "Begin": int(d["start_offset"] * time_offset * 1000),
        "End": int(d["end_offset"] * time_offset * 1000),
}
    for d in outputs.word_offsets
]
print(lyric_offset)
print(processor.decode(output.logits.cpu().detach().numpy()[0], beam_width=100).text)

In [None]:
from transformers.file_utils import cached_path, hf_bucket_url
import os, zipfile
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import kenlm
from pyctcdecode import Alphabet, BeamSearchDecoderCTC, LanguageModel
import IPython
cache_dir = './cache/'
processor = Wav2Vec2Processor.from_pretrained("nguyenvulebinh/wav2vec2-large-vi", cache_dir=cache_dir)
model = Wav2Vec2ForCTC.from_pretrained("nguyenvulebinh/wav2vec2-large-vi", cache_dir=cache_dir)
lm_file = hf_bucket_url("nguyenvulebinh/wav2vec2-base-vi", filename='vi_lm_4grams.bin')
lm_file = cached_path(lm_file,cache_dir=cache_dir)
with zipfile.ZipFile(lm_file, 'r') as zip_ref:
    zip_ref.extractall(cache_dir)
lm_file = cache_dir + 'vi_lm_4grams.bin'

Some weights of the model checkpoint at nguyenvulebinh/wav2vec2-large-vi were not used when initializing Wav2Vec2ForCTC: ['quantizer.weight_proj.weight', 'quantizer.weight_proj.bias', 'project_q.weight', 'project_hid.weight', 'quantizer.codevectors', 'project_q.bias', 'project_hid.bias']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at nguyenvulebinh/wav2vec2-large-vi and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to

EntryNotFoundError: ignored

In [None]:
def get_decoder_ngram_model(tokenizer, ngram_lm_path):
    vocab_dict = tokenizer.get_vocab()
    sort_vocab = sorted((value, key) for (key, value) in vocab_dict.items())
    vocab = [x[1] for x in sort_vocab][:-2]
    vocab_list = vocab
    # convert ctc blank character representation
    #vocab_list[tokenizer.pad_token_id] = ""
    # replace special characters
    vocab_list[tokenizer.unk_token_id] = ""
    # vocab_list[tokenizer.bos_token_id] = ""
    # vocab_list[tokenizer.eos_token_id] = ""
    # convert space character representation
    vocab_list[tokenizer.word_delimiter_token_id] = " "
    # specify ctc blank char index, since conventially it is the last entry of the logit matrix
    alphabet = Alphabet.build_alphabet(vocab_list)#, ctc_token_idx=tokenizer.pad_token_id)
    lm_model = kenlm.Model(ngram_lm_path)
    decoder = BeamSearchDecoderCTC(alphabet,
                                   language_model=LanguageModel(lm_model))
    return decoder
ngram_lm_model = get_decoder_ngram_model(processor.tokenizer, lm_file)



In [None]:
beam_search_output = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
print("Beam search output: {}".format(beam_search_output))

ValueError: ignored